From 55369ec10298dc6f7f6c0e5518e2f521e6616a08 Mon Sep 17 00:00:00 2001 From: Brian Middendorf Date: Mon, 9 Mar 2026 17:23:34 -0500 Subject: [PATCH 1/2] purview-v1.10.8 --- ...AX_Purview_Audit_Log_Processor_v1.10.8.ps1 | 68 +- ...dit_Log_Processor_Documentation_v1.10.0.md | 13 +- ...udit_Log_Processor_Release_Note_v1.10.0.md | 8 +- ...AX_Purview_Audit_Log_Processor_v1.10.8.ps1 | 17341 ++++++++++++++++ 4 files changed, 17409 insertions(+), 21 deletions(-) rename PAX_Purview_Audit_Log_Processor_v1.10.7.ps1 => PAX_Purview_Audit_Log_Processor_v1.10.8.ps1 (99%) create mode 100644 script_archive/Purview_Audit_Log_Processor/PAX_Purview_Audit_Log_Processor_v1.10.8.ps1 diff --git a/PAX_Purview_Audit_Log_Processor_v1.10.7.ps1 b/PAX_Purview_Audit_Log_Processor_v1.10.8.ps1 similarity index 99% rename from PAX_Purview_Audit_Log_Processor_v1.10.7.ps1 rename to PAX_Purview_Audit_Log_Processor_v1.10.8.ps1 index 74e15b6..0eab726 100644 --- a/PAX_Purview_Audit_Log_Processor_v1.10.7.ps1 +++ b/PAX_Purview_Audit_Log_Processor_v1.10.8.ps1 @@ -1,5 +1,5 @@ # Portable Audit eXporter (PAX) - Purview Audit Log Processor -# Version: v1.10.7 +# Version: v1.10.8 # Default Activity Type: CopilotInteraction (captures ALL M365 Copilot usage including all M365 apps and Teams meetings) # DSPM for AI: Microsoft Purview Data Security Posture Management integration # MIXED FREE/PAYG Activity Types: AIInteraction (currently Microsoft platforms only), ConnectedAIAppInteraction (Microsoft + third-party) @@ -42,6 +42,8 @@ - Partial success: Continues processing with successful partitions if some fail - Query naming: PAX_Query__PartX/Total visible in Purview UI - Unified concurrency: MaxConcurrency parameter controls both EOM and Graph API modes (default: 10) + - Date-range accuracy: Client-side trimming ensures output contains only records within + [StartDate, EndDate), compensating for Purview API date-range bleed - Checkpoint & Resume: All auth modes automatically save progress to checkpoint files, 
enabling resumption after Ctrl+C, network failures, or any interruption via -Resume @@ -750,13 +752,13 @@ Forms, Stream, Planner, PowerApps, and Office desktop apps. ACTIVITY TYPES INCLUDED: - Exchange: MailboxLogin, MailItemsAccessed, Send, SendOnBehalf, SoftDelete, HardDelete, + Exchange: MailItemsAccessed, Send, SendOnBehalf, SoftDelete, HardDelete, MoveToDeletedItems, CopyToFolder SharePoint/OneDrive (Files): FileAccessed, FileDownloaded, FileUploaded, FileModified, FileDeleted, FileMoved, FileCheckedIn, FileCheckedOut, FileRecycled, FileRestored, FileVersionsAllDeleted - SharePoint/OneDrive (Sharing): SharingSet, SharingInvitationCreated, SharingInvitationAccepted, - SharedLinkCreated, SharingRevoked, AddedToSecureLink, RemovedFromSecureLink, SecureLinkUsed + SharePoint/OneDrive (Sharing): SharingInvitationCreated, SharingInvitationAccepted, + SharedLinkCreated, SharingRevoked, RemovedFromSecureLink Groups: AddMemberToUnifiedGroup, RemoveMemberFromUnifiedGroup Teams (Team/Channel): TeamCreated, TeamDeleted, TeamArchived, TeamSettingChanged, TeamMemberAdded, TeamMemberRemoved, MemberAdded, MemberRemoved, MemberRoleChanged, @@ -1678,9 +1680,9 @@ $recordTypeWorkloadMap = @{ $serviceOperationMap = @{ 'AzureActiveDirectory' = @('UserLoggedIn','UserLoginFailed','AdminLoggedIn','ResetUserPassword','AddRegisteredUser','UpdateUser','ChangedUserSetting') - 'Exchange' = @('MailboxLogin','MailItemsAccessed','Send','SendOnBehalf','SoftDelete','HardDelete','MoveToDeletedItems','CopyToFolder','NewInboxRule','UpdateInboxRules','AddMailboxPermission','RemoveMailboxPermission') - 'SharePoint' = @('FileAccessed','FileDownloaded','FileUploaded','FileModified','FileDeleted','FileMoved','SharingSet','SharingInvitationCreated','SharingInvitationAccepted','SharedLinkCreated','SharingRevoked','AddMemberToUnifiedGroup','RemoveMemberFromUnifiedGroup') - 'OneDrive' = 
@('FileAccessed','FileDownloaded','FileUploaded','FileModified','FileDeleted','FileMoved','SharingSet','SharingInvitationCreated','SharingInvitationAccepted','SharedLinkCreated','SharingRevoked','AddMemberToUnifiedGroup','RemoveMemberFromUnifiedGroup') + 'Exchange' = @('MailItemsAccessed','Send','SendOnBehalf','SoftDelete','HardDelete','MoveToDeletedItems','CopyToFolder','AddMailboxPermission','RemoveMailboxPermission') + 'SharePoint' = @('FileAccessed','FileDownloaded','FileUploaded','FileModified','FileDeleted','FileMoved','SharingInvitationCreated','SharingInvitationAccepted','SharedLinkCreated','SharingRevoked','AddMemberToUnifiedGroup','RemoveMemberFromUnifiedGroup') + 'OneDrive' = @('FileAccessed','FileDownloaded','FileUploaded','FileModified','FileDeleted','FileMoved','SharingInvitationCreated','SharingInvitationAccepted','SharedLinkCreated','SharingRevoked','AddMemberToUnifiedGroup','RemoveMemberFromUnifiedGroup') 'Teams' = @('TeamMemberAdded','TeamMemberRemoved','ChannelAdded','ChannelDeleted','ChannelMessageSent','ChannelMessageDeleted','TeamDeleted','TeamArchived','AddMemberToUnifiedGroup','RemoveMemberFromUnifiedGroup') 'MicrosoftForms' = @('CreateForm','EditForm','DeleteForm','ViewForm','CreateResponse','SubmitResponse','ViewResponse','DeleteResponse') 'MicrosoftStream' = @('StreamModified','StreamViewed','StreamDeleted','StreamDownloaded') @@ -1694,15 +1696,15 @@ $m365UsageRecordBundle = @('ExchangeAdmin','ExchangeItem','ExchangeMailbox','Sha # Curated M365 usage operations spanning Exchange/SharePoint/OneDrive/Teams/Forms/Stream/Planner/PowerApps and Office desktop apps (Word/Excel/PowerPoint/OneNote) $m365UsageActivityBundle = @( # === Exchange/Email === - 'MailboxLogin','MailItemsAccessed','Send','SendOnBehalf','SoftDelete','HardDelete','MoveToDeletedItems','CopyToFolder', + 'MailItemsAccessed','Send','SendOnBehalf','SoftDelete','HardDelete','MoveToDeletedItems','CopyToFolder', # === SharePoint/OneDrive - Files === 
'FileAccessed','FileDownloaded','FileUploaded','FileModified','FileDeleted','FileMoved', 'FileCheckedIn','FileCheckedOut','FileRecycled','FileRestored','FileVersionsAllDeleted', # === SharePoint/OneDrive - Sharing === - 'SharingSet','SharingInvitationCreated','SharingInvitationAccepted','SharedLinkCreated','SharingRevoked', - 'AddedToSecureLink','RemovedFromSecureLink','SecureLinkUsed', + 'SharingInvitationCreated','SharingInvitationAccepted','SharedLinkCreated','SharingRevoked', + 'RemovedFromSecureLink', # === Groups/Unified Groups === 'AddMemberToUnifiedGroup','RemoveMemberFromUnifiedGroup', @@ -1757,7 +1759,7 @@ $m365UsageActivityBundle = @( ) | Select-Object -Unique # Script version constant (must appear after param/help to keep param() valid as first executable block) -$ScriptVersion = '1.10.7' +$ScriptVersion = '1.10.8' # --- Initialize/Clear persistent script variables to prevent cross-run contamination --- # Note: Script-scoped variables persist across multiple script invocations in the same PowerShell session @@ -2439,6 +2441,15 @@ else { } } +# Client-side date-range trim boundaries — Purview's partition-based indexing can +# return records outside the requested date range (observed up to ~10 h past EndDate). +# These UTC boundaries are used after dedup to trim any out-of-range records. +# SpecifyKind(Utc) is critical: ParseExact returns Kind=Unspecified, and .ToUniversalTime() +# on Unspecified assumes LOCAL time, shifting the boundary by the machine's UTC offset. +$script:TrimStartDateUTC = if ($StartDate -ne '*') { [datetime]::SpecifyKind([datetime]::ParseExact($StartDate, 'yyyy-MM-dd', $null), [System.DateTimeKind]::Utc) } else { $null } +$script:TrimEndDateUTC = if ($EndDate -ne '*') { [datetime]::SpecifyKind([datetime]::ParseExact($EndDate, 'yyyy-MM-dd', $null), [System.DateTimeKind]::Utc) } else { $null } +$script:DateTrimCount = 0 + if ($BlockHours -le 0) { Write-Host "ERROR: BlockHours must be positive." 
-ForegroundColor Red; exit 1 } try { if ($PSVersionTable.PSEdition -eq 'Core' -and ($global:InformationPreference -in @('SilentlyContinue', 'Ignore'))) { $global:InformationPreference = 'Continue' } } catch {} @@ -4802,6 +4813,16 @@ function Merge-IncrementalSaves-Streaming { } if ($recordId) { [void]$seenIds.Add($recordId) } + # Client-side date-range trimming for streaming path + if ($script:TrimStartDateUTC -or $script:TrimEndDateUTC) { + $recDate = script:Parse-DateSafe $record.CreationDate + if ($recDate) { + $recDateUtc = $recDate.ToUniversalTime() + if ($script:TrimStartDateUTC -and $recDateUtc -lt $script:TrimStartDateUTC) { $script:DateTrimCount++; continue } + if ($script:TrimEndDateUTC -and $recDateUtc -ge $script:TrimEndDateUTC) { $script:DateTrimCount++; continue } + } + } + # Parse AuditData for Operation if needed $auditData = $record.AuditData $parsedAudit = if ($record.PSObject.Properties['_ParsedAuditData']) { @@ -15127,6 +15148,27 @@ Write-Output "[403-MAX] Partition $idx/$tot - Max transient 403 poll retries exc } } + # Client-side date-range trimming — remove records returned outside the requested date boundaries. + # Purview's partition-based indexing can bleed records up to ~10 hours past EndDate on large tenants. 
+ if (($script:TrimStartDateUTC -or $script:TrimEndDateUTC) -and $allLogs.Count -gt 0) { + $preTrimCount = $allLogs.Count + $trimmedLogs = New-Object System.Collections.ArrayList($preTrimCount) + foreach ($log in $allLogs) { + $cd = script:Parse-DateSafe $log.CreationDate + if (-not $cd) { [void]$trimmedLogs.Add($log); continue } # Keep records with unparseable dates + $cdUtc = $cd.ToUniversalTime() + if ($script:TrimStartDateUTC -and $cdUtc -lt $script:TrimStartDateUTC) { continue } + if ($script:TrimEndDateUTC -and $cdUtc -ge $script:TrimEndDateUTC) { continue } + [void]$trimmedLogs.Add($log) + } + $trimCount = $preTrimCount - $trimmedLogs.Count + if ($trimCount -gt 0) { + $script:DateTrimCount += $trimCount + $allLogs = $trimmedLogs + Write-LogHost "Date-range trim: Removed $trimCount record(s) outside requested date boundaries" -ForegroundColor Yellow + } + } + # Show accurate record count — in streaming mode allLogs may be empty because records went JSONL→CSV directly if ($script:UseStreamingMergeForExport) { # StreamingMergeRecordCount = memory flush fresh run; mergedFromIncremental = deferred resume merge @@ -16973,6 +17015,10 @@ function Profile-AuditData { param([object]$AuditData) } # No-op stub for thread if ($script:StreamingMergeDuplicatesSkipped -gt 0) { Write-LogHost " Deduped: $($script:StreamingMergeDuplicatesSkipped) duplicate records removed" -ForegroundColor DarkGray } + # Show date-range trim count in Pipeline Summary + if ($script:DateTrimCount -gt 0) { + Write-LogHost " Trimmed: $($script:DateTrimCount) record(s) outside requested date range" -ForegroundColor DarkGray + } # Show data loss warning in Pipeline Summary if partitions were missing if ($script:StreamingMergeDataLoss) { Write-LogHost " [DATA-LOSS] WARNING: Output is PARTIAL — missing partitions: $($script:StreamingMergeMissingPartitions -join ', ')" -ForegroundColor Yellow diff --git 
a/release_documentation/Purview_Audit_Log_Processor/PAX_Purview_Audit_Log_Processor_Documentation_v1.10.0.md b/release_documentation/Purview_Audit_Log_Processor/PAX_Purview_Audit_Log_Processor_Documentation_v1.10.0.md index b6d820a..383189d 100644 --- a/release_documentation/Purview_Audit_Log_Processor/PAX_Purview_Audit_Log_Processor_Documentation_v1.10.0.md +++ b/release_documentation/Purview_Audit_Log_Processor/PAX_Purview_Audit_Log_Processor_Documentation_v1.10.0.md @@ -1,6 +1,6 @@ # Portable Audit eXporter (PAX) -
Purview Audit Log Processor -> **📥 Quick Start:** Download the script → [`PAX_Purview_Audit_Log_Processor_v1.10.7.ps1`](https://github.com/microsoft/PAX/releases/download/purview-v1.10.7/PAX_Purview_Audit_Log_Processor_v1.10.7.ps1) +> **📥 Quick Start:** Download the script → [`PAX_Purview_Audit_Log_Processor_v1.10.8.ps1`](https://github.com/microsoft/PAX/releases/download/purview-v1.10.8/PAX_Purview_Audit_Log_Processor_v1.10.8.ps1) > > **📋 Release Notes:** See what's new → [v1.10.x Release Notes](https://github.com/microsoft/PAX/blob/release/release_notes/Purview_Audit_Log_Processor/PAX_Purview_Audit_Log_Processor_Release_Note_v1.10.0.md) | [All Release Notes](https://github.com/microsoft/PAX/tree/release/release_notes/Purview_Audit_Log_Processor) > @@ -8,7 +8,7 @@ > > **📚 Documentation Archive:** [v1.10.x Documentation](https://github.com/microsoft/PAX/blob/release/release_documentation/Purview_Audit_Log_Processor/PAX_Purview_Audit_Log_Processor_Documentation_v1.10.0.md) | [All Documentation](https://github.com/microsoft/PAX/tree/release/release_documentation/Purview_Audit_Log_Processor) -**Script:** [`PAX_Purview_Audit_Log_Processor_v1.10.7.ps1`](https://github.com/microsoft/PAX/releases/download/purview-v1.10.7/PAX_Purview_Audit_Log_Processor_v1.10.7.ps1) +**Script:** [`PAX_Purview_Audit_Log_Processor_v1.10.8.ps1`](https://github.com/microsoft/PAX/releases/download/purview-v1.10.8/PAX_Purview_Audit_Log_Processor_v1.10.8.ps1) **Documentation Version:** 1.10.x **Audience:** IT admins, security/compliance analysts, BI/data teams **Runtime:** PowerShell 5.1 (compatible) / PowerShell 7+ (recommended) @@ -409,6 +409,7 @@ powershell -ExecutionPolicy Bypass -File .\PAX_Purview_Audit_Log_Processor.ps1 - - **Live mode:** Must specify both or neither (partial specification rejected) - **Replay mode:** Both dates optional; act as filters on `CreationDate` column - **Time zone:** All dates interpreted as UTC; convert local times before invocation +- **Date-range accuracy:** 
The Purview API may return records slightly beyond the requested `EndDate`. PAX automatically trims these so the exported output contains only records within your specified `[StartDate, EndDate)` range. The number of trimmed records is reported in the log file's Pipeline Summary. --- @@ -924,9 +925,9 @@ Automating scripts, using headless terminals, or SSO scenarios | Category | Operations | |----------|------------| -| Outlook (Exchange) | MailboxLogin, MailItemsAccessed, Send, SendOnBehalf, SoftDelete, HardDelete, MoveToDeletedItems, CopyToFolder | +| Outlook (Exchange) | MailItemsAccessed, Send, SendOnBehalf, SoftDelete, HardDelete, MoveToDeletedItems, CopyToFolder | | SharePoint/OneDrive (Files) | FileAccessed, FileDownloaded, FileUploaded, FileModified, FileDeleted, FileMoved, FileCheckedIn, FileCheckedOut, FileRecycled, FileRestored, FileVersionsAllDeleted | -| SharePoint/OneDrive (Sharing) | SharingSet, SharingInvitationCreated, SharingInvitationAccepted, SharedLinkCreated, SharingRevoked, AddedToSecureLink, RemovedFromSecureLink, SecureLinkUsed | +| SharePoint/OneDrive (Sharing) | SharingInvitationCreated, SharingInvitationAccepted, SharedLinkCreated, SharingRevoked, RemovedFromSecureLink | | Groups | AddMemberToUnifiedGroup, RemoveMemberFromUnifiedGroup | | Teams (Team/Channel) | TeamCreated, TeamDeleted, TeamArchived, TeamSettingChanged, TeamMemberAdded, TeamMemberRemoved, MemberAdded, MemberRemoved, MemberRoleChanged, ChannelAdded, ChannelDeleted, ChannelSettingChanged, ChannelOwnerResponded, ChannelMessageSent, ChannelMessageDeleted, BotAddedToTeam, BotRemovedFromTeam, TabAdded, TabRemoved, TabUpdated, ConnectorAdded, ConnectorRemoved, ConnectorUpdated | | Teams (Chat/Messaging) | TeamsSessionStarted, ChatCreated, ChatRetrieved, ChatUpdated, MessageSent, MessageRead, MessageDeleted, MessageUpdated, MessagesListed, MessageCreation, MessageCreatedHasLink, MessageEditedHasLink, MessageHostedContentRead, MessageHostedContentsListed, SensitiveContentShared | 
@@ -4471,7 +4472,6 @@ The bundle includes activities from 10 categories: | Operation | Description | |-----------|-------------| -| MailboxLogin | User accessed mailbox | | MailItemsAccessed | Email items accessed (read/preview) | | Send | Email sent | | SendOnBehalf | Email sent on behalf of another user | @@ -4500,14 +4500,11 @@ The bundle includes activities from 10 categories: | Operation | Description | |-----------|-------------| -| SharingSet | Sharing permissions set | | SharingInvitationCreated | Sharing invitation created | | SharingInvitationAccepted | Sharing invitation accepted | | SharedLinkCreated | Shared link created | | SharingRevoked | Sharing permissions revoked | -| AddedToSecureLink | User added to secure link | | RemovedFromSecureLink | User removed from secure link | -| SecureLinkUsed | Secure link accessed | #### Groups diff --git a/release_notes/Purview_Audit_Log_Processor/PAX_Purview_Audit_Log_Processor_Release_Note_v1.10.0.md b/release_notes/Purview_Audit_Log_Processor/PAX_Purview_Audit_Log_Processor_Release_Note_v1.10.0.md index 889ec1c..f94b124 100644 --- a/release_notes/Purview_Audit_Log_Processor/PAX_Purview_Audit_Log_Processor_Release_Note_v1.10.0.md +++ b/release_notes/Purview_Audit_Log_Processor/PAX_Purview_Audit_Log_Processor_Release_Note_v1.10.0.md @@ -3,7 +3,7 @@ ## Release Information - **Version:** 1.10.x -- **Release Date:** 2026-03-05 +- **Release Date:** 2026-03-09 - **Released By:** Microsoft Copilot Growth ROI Advisory Team (copilot-roi-advisory-team-gh@microsoft.com) --- @@ -12,7 +12,7 @@ Download the script below. For questions or issues, refer to the documentation. 
-- **PAX Purview Audit Log Processor Script v1.10.7:** [PAX_Purview_Audit_Log_Processor_v1.10.7.ps1](https://github.com/microsoft/PAX/releases/download/purview-v1.10.7/PAX_Purview_Audit_Log_Processor_v1.10.7.ps1) +- **PAX Purview Audit Log Processor Script v1.10.8:** [PAX_Purview_Audit_Log_Processor_v1.10.8.ps1](https://github.com/microsoft/PAX/releases/download/purview-v1.10.8/PAX_Purview_Audit_Log_Processor_v1.10.8.ps1) - **Documentation v1.10.x (Markdown):** [PAX_Purview_Audit_Log_Processor_Documentation_v1.10.x.md](https://github.com/microsoft/PAX/blob/release/release_documentation/Purview_Audit_Log_Processor/PAX_Purview_Audit_Log_Processor_Documentation_v1.10.0.md) --- @@ -511,6 +511,10 @@ If minimum window reached: - **(v1.10.7) Checkpoint log file rename in split mode:** Fixed `_PARTIAL` suffix remaining on the log file after a fully successful export when CSV split mode (`-SplitByActivityType` or `-SplitByRecordType`) is active. The fallback rename is guarded to preserve `_PARTIAL` on genuinely interrupted runs for Resume mode detection. +- **(v1.10.8) Purview date-range bleed — client-side trimming:** Added client-side date-range trimming to eliminate records that bleed past the requested `EndDate` boundary by up to ~10 hours (a known Purview API behavior affecting ~3–5% of returned records). Output CSV is now guaranteed to contain only records within the user-specified `[StartDate, EndDate)` range. Timezone-safe boundary parsing uses `SpecifyKind(..., Utc)` to prevent local timezone offsets from shifting UTC midnight boundaries. Trimmed record count is reported in the Pipeline Summary for operator visibility. + +- **(v1.10.8) M365 Usage Bundle — noisy operations removed:** Removed four high-volume, low-signal operations from the `-IncludeM365Usage` bundle: `MailboxLogin`, `SharingSet`, `AddedToSecureLink`, and `SecureLinkUsed`. `NewInboxRule` and `UpdateInboxRules` were also removed from the Exchange entry of the service operation map (`$serviceOperationMap`). Bundle size reduced from ~121 to ~117 curated operation types. 
All removed operations remain available for explicit queries via `-ActivityTypes`. + --- ## Known Considerations diff --git a/script_archive/Purview_Audit_Log_Processor/PAX_Purview_Audit_Log_Processor_v1.10.8.ps1 b/script_archive/Purview_Audit_Log_Processor/PAX_Purview_Audit_Log_Processor_v1.10.8.ps1 new file mode 100644 index 0000000..0eab726 --- /dev/null +++ b/script_archive/Purview_Audit_Log_Processor/PAX_Purview_Audit_Log_Processor_v1.10.8.ps1 @@ -0,0 +1,17341 @@ +# Portable Audit eXporter (PAX) - Purview Audit Log Processor +# Version: v1.10.8 +# Default Activity Type: CopilotInteraction (captures ALL M365 Copilot usage including all M365 apps and Teams meetings) +# DSPM for AI: Microsoft Purview Data Security Posture Management integration +# MIXED FREE/PAYG Activity Types: AIInteraction (currently Microsoft platforms only), ConnectedAIAppInteraction (Microsoft + third-party) +# PAYG Activity Types: AIAppInteraction (third-party AI via network DLP) +# ---PAYG only applies to third-party AI apps/agents audit records and never applies to any audit records generated by Microsoft AI apps/agents +# NOTE: Graph API mode automatically detects v1.0 (GA) or beta endpoints at runtime (no config needed) +# NOTE: Uses operationFilters with operation names for ALL activity types (e.g., "CopilotInteraction", "AIInteraction") +# See: https://learn.microsoft.com/en-us/office/office-365-management-api/office-365-management-activity-api-schema +<# +.SYNOPSIS + Export Microsoft Purview audit logs for Microsoft 365 Copilot and DSPM for AI activity types with optional Purview-aligned row explosion and deep flattening. 
+ +.DESCRIPTION + Modes: + Standard - One row per audit record (raw CopilotEventData JSON preserved) + -ExplodeArrays - Produces canonical Purview exploded schema (35 fixed columns) + -ExplodeDeep - Same 35-column Purview schema + appended deep-flattened CopilotEventData.* columns + + Graph API Version Configuration: + - PAX automatically detects the correct Graph API security/auditLog endpoint version + - Configurable version variables near top of script (lines ~1519-1527): + $script:GraphAuditApiVersion_Current = 'v1.0' # Try this version first + $script:GraphAuditApiVersion_Previous = 'beta' # Fallback if current unavailable + - Detection occurs once per session and is reused automatically + - Single-line terminal output shows which version is active + - Manual override: Edit the version variables if Microsoft releases new API versions + - Default: Tries v1.0 first (expected GA Q1 2026), falls back to beta if unavailable + - No command-line switches needed - fully automatic with manual override capability + + Parallel Explosion Processing (PS7+ only): + - After data retrieval, explosion of records into rows can be parallelized + - Automatic on PS7+ with >500 records (uses job queue with ~1000 records per chunk) + - Control via -ExplosionThreads: 0=auto (2-16 threads), 1=serial, 2-32=explicit + - Output is identical to serial mode (same columns, data, row count; only order may differ) + - Falls back to serial processing on PowerShell 5.1 + + Reliability & Resilience: + - Automatic retry logic: Up to 3 attempts per partition with smart cooldown + - End-of-run summary: Shows Complete/Incomplete/Failed partitions with QueryIds + - Partial success: Continues processing with successful partitions if some fail + - Query naming: PAX_Query__PartX/Total visible in Purview UI + - Unified concurrency: MaxConcurrency parameter controls both EOM and Graph API modes (default: 10) + - Date-range accuracy: Client-side trimming ensures output contains only records within + 
[StartDate, EndDate), compensating for Purview API date-range bleed + - Checkpoint & Resume: All auth modes automatically save progress to checkpoint files, + enabling resumption after Ctrl+C, network failures, or any interruption via -Resume + + Offline Replay (-RAWInputCSV): + * Ingest a previously exported raw Purview audit CSV (must contain AuditData JSON column) + * Skips authentication & live Search-UnifiedAuditLog queries entirely + * Forces at least Purview array explosion even if -ExplodeArrays not supplied + * Optional -ExplodeDeep further deep‑flattens CopilotEventData.* + * Allows only filtering parameters (StartDate / EndDate / ActivityTypes / AgentId / AgentsOnly / PromptFilter / ExcludeAgents / UserIds) plus OutputFile, AppendFile & explosion switches + * Disallowed with RAWInputCSV (error if present): BlockHours, ResultSize, PacingMs, Auth, ParallelMode, MaxParallelGroups, MaxConcurrency, EnableParallel, GroupNames + * StartDate / EndDate act as inclusive(lower)/exclusive(upper) UTC filters on CreationDate in the replay dataset + * ActivityTypes filters by Operation (case‑insensitive membership) + * AgentId filters for specific AgentId value(s); AgentsOnly includes any record with an AgentId present + * PromptFilter filters messages by isPrompt property (Prompt/Response/Both/Null) + * ExcludeAgents removes records with AgentId present (inverse of AgentsOnly) + * UserIds filters by UserId extracted from AuditData JSON (client-side filtering) + * GroupNames is NOT supported in replay mode (requires authentication for group expansion) + * Non‑exploded 1:1 mode is intentionally disabled for deterministic schema in offline transforms + + Filtering: + -AgentId : Filter to records matching specific AgentId value(s) + -AgentsOnly : Filter to records with any AgentId present (mutually exclusive with -ExcludeAgents) + -ExcludeAgents : Filter to records WITHOUT AgentId (mutually exclusive with -AgentId/-AgentsOnly) + -PromptFilter + Prompt : Only export 
messages where Message_isPrompt = True + Response : Only export messages where Message_isPrompt = False + Both : Export messages with either True or False isPrompt values + Null : Only export messages with null/undefined isPrompt values (rare) + Note: PromptFilter uses two-stage filtering for optimal performance: + Stage 1 (Pre-filter): Filters records before explosion based on message content + Stage 2 (Message-level): Filters individual messages during explosion + + -UserIds : Filter to specific user identifier(s) + LIVE MODE: SERVER-SIDE filtering at Purview (efficient, no unnecessary data transfer) + REPLAY MODE: CLIENT-SIDE filtering by extracting UserId from AuditData JSON (slower but functional) + + Accepted formats: + • User Principal Name (UPN): "john.doe@contoso.com" + • SMTP Address: "john.doe@contoso.com" + • User GUID: "a1b2c3d4-e5f6-7890-abcd-ef1234567890" + Examples: + -UserIds "john.doe@contoso.com" + -UserIds "john.doe@contoso.com","jane.smith@contoso.com","bob.jones@contoso.com" + + -GroupNames : Filter to members of distribution/security group(s) + LIVE MODE ONLY: Groups automatically expanded to individual users after authentication using Get-DistributionGroupMember + REPLAY MODE: NOT SUPPORTED (requires authentication) - use -UserIds with explicit email addresses instead + + Accepted formats (LIVE MODE only): + • Group Display Name: "Executive Leadership Team" + • Group Email (Alias): "exec-team@contoso.com" + • Group GUID: "a1b2c3d4-e5f6-7890-abcd-ef1234567890" + • Distinguished Name: "CN=ExecTeam,OU=Groups,DC=contoso,DC=com" + Examples: + -GroupNames "Executive Leadership" + -GroupNames "exec-team@contoso.com" + -GroupNames "Engineering Managers","Product Leads","Sales Directors" + Note: Groups are expanded once after authentication + Blocked in replay mode (-RAWInputCSV) - script will exit with error + + Combining UserIds + GroupNames (LIVE MODE ONLY): + • When both are specified, the script combines and deduplicates the user lists + • 
Example: -UserIds "ceo@contoso.com" -GroupNames "Board of Directors" + Pulls records for the CEO plus all expanded board members (duplicates removed) + • Not available in replay mode - use -UserIds only + + COMBINING FILTERS - Powerful Use Cases: + + All filters can be combined for highly targeted data extraction. Filter application order is now CONSISTENT across both modes: + + FILTER APPLICATION ORDER (BOTH MODES): + 1. User/Group filtering (server-side in live mode via -UserIds, client-side in replay mode) + 2. Agent filtering (AgentsOnly, AgentId, or ExcludeAgents) + 3. PromptFilter (during explosion: Prompt, Response, Both, or Null) + + NOTE: Applying User/Group filtering first improves performance by reducing the dataset before subsequent filters. + + TWO-FILTER COMBINATIONS: + + User + Agent: + Use Case: Analyze specific user(s) interactions with Copilot agents + Example: "Show me all agent usage by our power users" + Command: -UserIds "poweruser@contoso.com" -AgentsOnly + + User + PromptFilter: + Use Case: Focus on conversation patterns (prompts/responses) for specific users + Example: "Show me only the questions asked by the executive team" + Command: -GroupNames "Executive Team" -PromptFilter Prompt + Result: Removes resource-only explosion rows, keeps only message data + + Agent + PromptFilter: + Use Case: Analyze agent conversation quality, prompt engineering effectiveness + Example: "Show me all prompts sent to our custom declarative agent" + Command: -AgentId "CopilotStudio.Declarative.abc123" -PromptFilter Prompt + + THREE-FILTER COMBINATION: + + User + Agent + PromptFilter: + Use Case: Deep-dive conversation analysis for specific users with agents + Example: "Show me all questions the sales team asked our Sales Copilot agent" + Command: -GroupNames "Sales Team" -AgentId "SalesCopilot.Agent" -PromptFilter Prompt + Benefits: + • Server-side filtering reduces data transfer (live mode) + • Agent filter removes non-agent interactions + • PromptFilter 
removes responses and resource-only rows + • Result: Clean dataset of just sales team questions to the agent + + REPLAY MODE COMBINATIONS: + All filter combinations work in replay mode except GroupNames + Use -UserIds with explicit email addresses instead of -GroupNames + Example: -RAWInputCSV "data.csv" -UserIds "user@contoso.com" -AgentsOnly -PromptFilter Both + + PowerShell 5.1 & 7+ supported. Parallel query retrieval and explosion processing require 7+. + +.EXECUTIONPOLICY + No internal execution policy bypass. Use external host invocation if needed: + powershell.exe -ExecutionPolicy Bypass -File .\PAX_Purview_Audit_Log_Processor.ps1 -StartDate 2025-10-01 -EndDate 2025-10-02 + pwsh.exe -ExecutionPolicy Bypass -File .\PAX_Purview_Audit_Log_Processor.ps1 -StartDate 2025-10-01 -EndDate 2025-10-02 + +.POWERSHELLVERSIONS + PS 5.1 & 7+. Query parallelization and explosion parallelization require PS 7+. + +.EXAMPLE + # Basic export with auto-generated timestamped filename + pwsh -File .\PAX_Purview_Audit_Log_Processor.ps1 -StartDate 2025-10-01 -EndDate 2025-10-02 -OutputPath C:\Temp\ +.EXAMPLE + # Array explosion mode with auto-generated filename + pwsh -File .\PAX_Purview_Audit_Log_Processor.ps1 -ExplodeArrays -StartDate 2025-10-01 -EndDate 2025-10-02 -OutputPath C:\Temp\ +.EXAMPLE + # Deep column explosion with auto-generated filename + pwsh -File .\PAX_Purview_Audit_Log_Processor.ps1 -ExplodeDeep -StartDate 2025-10-01 -EndDate 2025-10-02 -OutputPath C:\Temp\ +.EXAMPLE + # PowerShell 5.1 compatible (no parallelization) + powershell -File .\PAX_Purview_Audit_Log_Processor.ps1 -StartDate 2025-10-01 -EndDate 2025-10-02 -OutputPath C:\Temp\ +.EXAMPLE + # Offline replay (simple forced explosion) of a previously exported raw CSV + pwsh -File .\PAX_Purview_Audit_Log_Processor.ps1 -RAWInputCSV .\output\Copilot_RAW_20251001.csv -OutputPath C:\Temp\ +.EXAMPLE + # Offline replay with date & activity filtering + deep flatten + pwsh -File .\PAX_Purview_Audit_Log_Processor.ps1 
-RAWInputCSV .\output\Copilot_RAW_20251001.csv -ExplodeDeep -StartDate 2025-10-01 -EndDate 2025-10-02 -ActivityTypes CopilotInteraction -OutputPath C:\Temp\ +.EXAMPLE + # Deep flatten (wide) with higher schema sample & moderate chunk size (balance column coverage vs memory) + pwsh -File .\PAX_Purview_Audit_Log_Processor.ps1 -ExplodeDeep -StartDate 2025-10-01 -EndDate 2025-10-02 -StreamingSchemaSample 4000 -StreamingChunkSize 3000 -OutputPath C:\Temp\ +.EXAMPLE + # Extremely wide deep flatten: maximize schema sample, reduce chunk size for lower peak memory + pwsh -File .\PAX_Purview_Audit_Log_Processor.ps1 -ExplodeDeep -StartDate 2025-10-01 -EndDate 2025-10-02 -StreamingSchemaSample 6000 -StreamingChunkSize 1500 -OutputPath C:\Temp\ +.EXAMPLE + # Fast header freeze (narrow schema expectation) – smaller sample, larger chunk for throughput + # NOTE: In parallel mode (PS7+), full schema discovery scans ALL rows regardless of StreamingSchemaSample. + # These tuning examples primarily affect serial mode (PS5.1 or -ExplosionThreads 1). 
+ pwsh -File .\PAX_Purview_Audit_Log_Processor.ps1 -ExplodeDeep -StartDate 2025-10-01 -EndDate 2025-10-02 -StreamingSchemaSample 800 -StreamingChunkSize 6000 -OutputPath C:\Temp\ +.EXAMPLE + # Filter to only records with agents present + pwsh -File .\PAX_Purview_Audit_Log_Processor.ps1 -ExplodeArrays -StartDate 2025-10-01 -EndDate 2025-10-02 -AgentsOnly -OutputPath C:\Temp\ +.EXAMPLE + # Filter to only records WITHOUT agents + pwsh -File .\PAX_Purview_Audit_Log_Processor.ps1 -ExplodeArrays -StartDate 2025-10-01 -EndDate 2025-10-02 -ExcludeAgents -OutputPath C:\Temp\ +.EXAMPLE + # Filter to only prompt messages (Message_isPrompt = True) + pwsh -File .\PAX_Purview_Audit_Log_Processor.ps1 -ExplodeArrays -StartDate 2025-10-01 -EndDate 2025-10-02 -PromptFilter Prompt -OutputPath C:\Temp\ +.EXAMPLE + # Filter to only response messages (Message_isPrompt = False) + pwsh -File .\PAX_Purview_Audit_Log_Processor.ps1 -ExplodeArrays -StartDate 2025-10-01 -EndDate 2025-10-02 -PromptFilter Response -OutputPath C:\Temp\ +.EXAMPLE + # Combine filters: agents + prompts only + pwsh -File .\PAX_Purview_Audit_Log_Processor.ps1 -ExplodeArrays -StartDate 2025-10-01 -EndDate 2025-10-02 -AgentsOnly -PromptFilter Prompt -OutputPath C:\Temp\ +.EXAMPLE + # Filter to specific users + pwsh -File .\PAX_Purview_Audit_Log_Processor.ps1 -StartDate 2025-10-01 -EndDate 2025-10-02 -UserIds "john.doe@contoso.com","jane.smith@contoso.com" -OutputPath C:\Temp\ +.EXAMPLE + # Emit metrics JSON alongside CSV (auto-generated timestamped filename) + pwsh -File .\PAX_Purview_Audit_Log_Processor.ps1 -StartDate 2025-10-01 -EndDate 2025-10-01 -EmitMetricsJson -OutputPath C:\Temp\ +.EXAMPLE + # Emit metrics JSON to custom path + pwsh -File .\PAX_Purview_Audit_Log_Processor.ps1 -StartDate 2025-10-01 -EndDate 2025-10-01 -EmitMetricsJson -MetricsPath C:\Temp\purview_metrics_20251001.json -OutputPath C:\Temp\ +.EXAMPLE + # AutoCompleteness remediation workflow: first run incomplete (exit code 10), second run resolves 
saturated windows + pwsh -File .\PAX_Purview_Audit_Log_Processor.ps1 -StartDate 2025-10-05 -EndDate 2025-10-05 -EmitMetricsJson -OutputPath C:\Temp\ + # (Exit code 10 indicates saturated windows remain) + pwsh -File .\PAX_Purview_Audit_Log_Processor.ps1 -StartDate 2025-10-05 -EndDate 2025-10-05 -AutoCompleteness -EmitMetricsJson -OutputPath C:\Temp\ +.EXAMPLE + # Filter to security group members (automatically expanded) + pwsh -File .\PAX_Purview_Audit_Log_Processor.ps1 -StartDate 2025-10-01 -EndDate 2025-10-02 -GroupNames "Executive Leadership" -OutputPath C:\Temp\ +.EXAMPLE + # Filter to multiple groups + pwsh -File .\PAX_Purview_Audit_Log_Processor.ps1 -StartDate 2025-10-01 -EndDate 2025-10-02 -GroupNames "Executive Team","Engineering Managers" -OutputPath C:\Temp\ +.EXAMPLE + # Graph API: SharePoint/OneDrive document activity with record & service filters + pwsh -File .\PAX_Purview_Audit_Log_Processor.ps1 -StartDate 2025-12-01 -EndDate 2025-12-02 -ActivityTypes FileAccessed,FilePreviewed -RecordTypes sharePointFileOperation -ServiceTypes SharePoint,OneDrive -OutputPath C:\Temp\ +.EXAMPLE + # Microsoft 365 usage bundle (Exchange, SharePoint, OneDrive, Teams, Forms, Stream, Planner, PowerApps) + pwsh -File .\PAX_Purview_Audit_Log_Processor.ps1 -StartDate 2025-11-01 -EndDate 2025-11-02 -IncludeM365Usage -CombineOutput -OutputPath C:\Temp\ +.EXAMPLE + # Export with execution telemetry for performance analysis + pwsh -File .\PAX_Purview_Audit_Log_Processor.ps1 -StartDate 2025-11-01 -EndDate 2025-11-02 -IncludeTelemetry -OutputPath C:\Temp\ +.EXAMPLE + # Combine individual users and groups + pwsh -File .\PAX_Purview_Audit_Log_Processor.ps1 -StartDate 2025-10-01 -EndDate 2025-10-02 -UserIds "ceo@contoso.com" -GroupNames "Board of Directors" -OutputPath C:\Temp\ +.EXAMPLE + # Replay mode with user filtering (client-side filtering from JSON) + pwsh -File .\PAX_Purview_Audit_Log_Processor.ps1 -RAWInputCSV .\output\Copilot_RAW_20251001.csv -UserIds 
"john.doe@contoso.com","jane.smith@contoso.com" -OutputPath C:\Temp\ +.EXAMPLE + # COMBINING FILTERS: User + PromptFilter (conversation focus, removes resource-only rows) + pwsh -File .\PAX_Purview_Audit_Log_Processor.ps1 -StartDate 2025-10-01 -EndDate 2025-10-02 -UserIds "poweruser@contoso.com" -PromptFilter Both -OutputPath C:\Temp\ +.EXAMPLE + # COMBINING FILTERS: Group + Agent (team adoption of specific agent) + pwsh -File .\PAX_Purview_Audit_Log_Processor.ps1 -StartDate 2025-10-01 -EndDate 2025-10-02 -GroupNames "Sales Team" -AgentsOnly -OutputPath C:\Temp\ +.EXAMPLE + # COMBINING FILTERS: User + Agent + PromptFilter (prompts sent to agents by specific users) + pwsh -File .\PAX_Purview_Audit_Log_Processor.ps1 -StartDate 2025-10-01 -EndDate 2025-10-02 -UserIds "analyst@contoso.com" -AgentId "DataAnalysis.Agent" -PromptFilter Prompt -OutputPath C:\Temp\ +.EXAMPLE + # COMBINING FILTERS: Replay mode with User + Agent + PromptFilter + pwsh -File .\PAX_Purview_Audit_Log_Processor.ps1 -RAWInputCSV .\data.csv -UserIds "exec@contoso.com" -AgentsOnly -PromptFilter Both -OutputPath C:\Temp\ + +.EXAMPLE + # APPENDING: Append to existing CSV file (relative filename in OutputPath directory) + pwsh -File .\PAX_Purview_Audit_Log_Processor.ps1 -StartDate 2025-10-15 -EndDate 2025-10-16 -AppendFile "MyReport.csv" -OutputPath C:\Temp\ +.EXAMPLE + # APPENDING: Append to existing CSV file (full path) + pwsh -File .\PAX_Purview_Audit_Log_Processor.ps1 -StartDate 2025-10-15 -EndDate 2025-10-16 -AppendFile "C:\Data\Audit\CopilotActivity.csv" +.EXAMPLE + # APPENDING: Append to existing Excel workbook (requires -ExportWorkbook, -CombineOutput recommended) + pwsh -File .\PAX_Purview_Audit_Log_Processor.ps1 -StartDate 2025-10-15 -EndDate 2025-10-16 -AppendFile "WeeklyReport.xlsx" -ExportWorkbook -CombineOutput -OutputPath C:\Temp\ +.EXAMPLE + # APPENDING: Append with single activity type to existing CSV + pwsh -File .\PAX_Purview_Audit_Log_Processor.ps1 -StartDate 2025-10-15 -EndDate 
2025-10-16 -AppendFile "CopilotOnly.csv" -ActivityTypes CopilotInteraction -OutputPath C:\Temp\ + +.EXAMPLE + # Export to Excel workbook (multi-tab by activity type - default behavior) + pwsh -File .\PAX_Purview_Audit_Log_Processor.ps1 -StartDate 2025-10-01 -EndDate 2025-10-02 -ExportWorkbook + +.EXAMPLE + # Export to Excel workbook with combined output (single-tab with all activity types) + pwsh -File .\PAX_Purview_Audit_Log_Processor.ps1 -StartDate 2025-10-01 -EndDate 2025-10-02 -ExportWorkbook -CombineOutput + +.EXAMPLE + # CSV export with separate files per activity type (default behavior) + pwsh -File .\PAX_Purview_Audit_Log_Processor.ps1 -StartDate 2025-10-01 -EndDate 2025-10-02 -ActivityTypes CopilotInteraction,ConnectedAIAppInteraction + +.EXAMPLE + # CSV export with combined output (single file with all activity types) + pwsh -File .\PAX_Purview_Audit_Log_Processor.ps1 -StartDate 2025-10-01 -EndDate 2025-10-02 -ActivityTypes CopilotInteraction,ConnectedAIAppInteraction -CombineOutput + +.EXAMPLE + # Append data to existing Excel workbook (same activity types; the workbook must already exist) + pwsh -File .\PAX_Purview_Audit_Log_Processor.ps1 -StartDate 2025-10-03 -EndDate 2025-10-04 -ExportWorkbook -AppendFile "WeeklyReport.xlsx" + +.EXAMPLE + # Excel export with DSPM features (multi-tab with AIAppInteraction) + pwsh -File .\PAX_Purview_Audit_Log_Processor.ps1 -StartDate 2025-10-01 -EndDate 2025-10-02 -ExportWorkbook -IncludeDSPMForAI -ActivityTypes CopilotInteraction,AIAppInteraction + +.EXAMPLE + # Excel export with array explosion and deep flattening + pwsh -File .\PAX_Purview_Audit_Log_Processor.ps1 -StartDate 2025-10-01 -EndDate 2025-10-02 -ExportWorkbook -ExplodeDeep + +.EXAMPLE + # Resume interrupted operation (auto-discover checkpoint in OutputPath) + pwsh -File .\PAX_Purview_Audit_Log_Processor.ps1 -Resume -OutputPath C:\Temp\ + +.EXAMPLE + # Resume from specific checkpoint file + pwsh -File .\PAX_Purview_Audit_Log_Processor.ps1 -Resume "C:\Temp\.pax_checkpoint_20251215_143022.json" + +.EXAMPLE + # 
Resume with Force (use most recent checkpoint without prompting) + pwsh -File .\PAX_Purview_Audit_Log_Processor.ps1 -Resume -OutputPath C:\Temp\ -Force + +.EXAMPLE + # Parallel explosion with explicit 8 threads (PS7+ only) + pwsh -File .\PAX_Purview_Audit_Log_Processor.ps1 -ExplodeDeep -StartDate 2025-10-01 -EndDate 2025-10-02 -ExplosionThreads 8 -OutputPath C:\Temp\ + +.EXAMPLE + # Force serial explosion for debugging/comparison (disables parallel) + pwsh -File .\PAX_Purview_Audit_Log_Processor.ps1 -ExplodeDeep -StartDate 2025-10-01 -EndDate 2025-10-02 -ExplosionThreads 1 -OutputPath C:\Temp\ + +.NOTES + Reliability Features: + - Automatic retry: Failed partitions retried up to 3 times with smart delays + - Status tracking: Each partition tracked with QueryId and QueryName throughout execution + - Partial success: Script continues with successful data even if some partitions fail + - End summary: Detailed report showing Complete/Incomplete/Failed partitions + - Query names: Visible in Purview as PAX_Query_<Start>-<End>_PartX/Total + Example: PAX_Query_20241101_0000-20241101_0100_Part27/134 + + Concurrency Control: + - MaxConcurrency (default: 10): Single parameter for both modes + • EOM mode: Limits concurrent serial queries + • Graph API mode: Limits concurrent partition execution + - Replaced previous MaxActivePartitions parameter (use MaxConcurrency instead) + - Example: -MaxConcurrency 8 (reduces from default 10 for rate-limit sensitive environments) + + Graph API Version Detection: + PAX automatically detects whether to use v1.0 (GA) or beta endpoints for the Microsoft Graph + security audit API. Detection occurs on first API call and is remembered for the session. This + ensures seamless transition when Microsoft promotes the API from beta to v1.0 (expected Q1 2026). 
+ + Version Configuration (near top of script, manually editable): + $script:GraphAuditApiVersion_Current = 'v1.0' # Try this version first + $script:GraphAuditApiVersion_Previous = 'beta' # Fallback if current unavailable + + Version Selection: + • If current version available: Uses configured current (default: v1.0) + • If current version unavailable: Falls back to configured previous (default: beta) + • Detection is remembered per session (no repeated checks) + + Benefits: + • Zero-downtime transition when API reaches GA + • Backward compatibility with beta-only tenants + • Manual override capability by editing version variables at top of script + • Single-line log output shows which version is being used + + Future Updates: + If Microsoft releases a new version (e.g., v2.0), simply edit the variables: + $script:GraphAuditApiVersion_Current = 'v2.0' + $script:GraphAuditApiVersion_Previous = 'v1.0' + + Example End-of-Run Summary: + ═══════════════════════════════════════════════════════════════ + QUERY SUBMISSION SUMMARY + ═══════════════════════════════════════════════════════════════ + Total Partitions: 134 + Sent and Complete: 131 + [!] Sent but Incomplete: 2 + ✗ Never Sent: 1 + ═══════════════════════════════════════════════════════════════ + + Checkpoint & Resume: + All authentication modes automatically create checkpoint files to preserve progress + during long-running operations. This enables resumption after Ctrl+C, network failures, + token expiry, or any interruption without losing completed work. 
+ + Checkpoint Behavior: + • Created automatically for ALL auth modes (WebLogin, DeviceCode, AppRegistration) + • Saved to OutputPath as .pax_checkpoint_<yyyyMMdd_HHmmss>.json + • Updated after each partition completes + • Deleted automatically on successful completion + • Stores ALL processing parameters for complete state restoration + + Token Refresh: + • AppRegistration mode: + - Proactive refresh at ~45-50 minutes (before token expiry) + - Reactive refresh on 401 Unauthorized (backup) + - Fully automatic, silent re-authentication using stored credentials + • Interactive modes (WebLogin/DeviceCode): + - Reactive only: Triggered when 401 Unauthorized detected + - Attempts silent refresh first (using SDK cached refresh token) + - Prompts user only if silent refresh fails + • 403 Forbidden errors: Indicate a permissions issue, NOT token expiry + - Token refresh will not help; check AuditLog.Read.All consent and roles + + Resume Mode (Standalone): + The -Resume switch restores ALL settings from the checkpoint file. + No other processing parameters can be specified with -Resume. + + Allowed with -Resume: + • -Force Use most recent checkpoint without prompting + • -Auth Override authentication method + • -TenantId Override tenant ID (for AppRegistration) + • -ClientId Override client ID (for AppRegistration) + • -ClientSecret Provide client secret (for AppRegistration) + + Usage Examples: + • -Resume Auto-discover checkpoint in current directory + • -Resume "path\to\file" Use specific checkpoint file + • -Resume -Force Use most recent checkpoint without prompting + • -Resume -Auth DeviceCode Resume with different auth method + + API Record Limits & Auto-Subdivision: + Graph API 1,000,000 Record Limit: + The Microsoft Graph security/auditLog API has a hard limit of 1,000,000 records per query. + PAX automatically detects when this limit is reached and handles it gracefully. 
+ + Detection: + • Identified when a partition returns exactly 1,000,000 records with no nextLink + • Warning displayed: "[SUBDIVISION] Partition X/Y - Fetched 1,000,000 records (Graph API limit reached)" + + Auto-Subdivision: + • Uses same BlockHours subdivision algorithm as EOM 10K limit handling + • Partition time window is halved and re-queried recursively + • Minimum window: 0.016667 hours (1 minute) - cannot subdivide below this + • If minimum reached, warning displayed and available records returned + + Recommendations for High-Volume Tenants: + • Use smaller -BlockHours (e.g., 0.25 or 0.1) for very active tenants + • Consider shorter date ranges for initial exports + • Monitor "[SUBDIVISION]" messages to tune BlockHours + + EOM 10,000 Record Limit: + The Exchange Online Management (EOM) Search-UnifiedAuditLog cmdlet returns maximum 10,000 + records per query. PAX automatically subdivides time windows when this limit is reached. + + Performance Optimization: + Parallel Explosion (PS7+ only): + After records are retrieved from Purview, the explosion phase (converting records + to rows with -ExplodeArrays or -ExplodeDeep) can be parallelized for significant + speedup on large datasets. 
+ + Behavior: + • Automatic on PS7+ when >500 records retrieved + • Uses job queue pattern: many small chunks (~1000 records) with N concurrent workers + • Better load balancing than fixed chunks when record complexity varies + • Full schema discovery: scans ALL rows for 100% column coverage (not sampling) + + Control via -ExplosionThreads: + • 0 (default): Auto-detect based on CPU cores (2 to 8 threads) + • 1: Force serial processing (for debugging or comparison) + • 2-8: Explicit thread count (capped at 8 for stability) + + Output Equivalence: + • Identical columns, data values, and row counts vs serial mode + • Row order may differ (chunks complete in parallel, not sequentially) + • Works with -ExplodeArrays, -ExplodeDeep, live mode, and replay mode + +.PARAMETER StartDate + Start date for audit log query in live mode (format: yyyy-MM-dd or MM/dd/yyyy). + In replay mode (-RAWInputCSV), acts as inclusive lower bound filter on CreationDate. + +.PARAMETER EndDate + End date for audit log query in live mode (format: yyyy-MM-dd or MM/dd/yyyy). + In replay mode (-RAWInputCSV), acts as exclusive upper bound filter on CreationDate. + +.PARAMETER OutputPath + Directory path where all output files will be created with auto-generated timestamped filenames. + Default: C:\Temp\ + +.PARAMETER FlatDepth + Maximum JSON flatten depth for exploding CopilotEventData and AuditData (default 120). + + The script automatically generates descriptive filenames based on: + • Activity types being exported + • Export mode (CSV vs Excel, combined vs separate) + • Current timestamp (yyyyMMdd_HHmmss format) + + Examples of auto-generated filenames: + • Purview_Audit_CopilotInteraction_20251110_143022.csv + • Purview_Audit_CombinedUsageActivity_20251110_143022.csv + • Purview_Audit_MultiTab_20251110_143022.xlsx + + Note: OutputPath accepts ONLY directory paths, not filenames. + Use -AppendFile parameter to specify a custom filename for appending to existing files. 
+ +.PARAMETER Auth + Authentication method. Options: + • WebLogin – Interactive browser authentication + • DeviceCode – Device code flow for headless scenarios + • Credential – Legacy username/password prompt or GRAPH_* env vars + • Silent – Managed identity or pre-cached token + • AppRegistration – Service principal using client secret or certificate + +.PARAMETER TenantId + Azure AD tenant ID (GUID). Required for -Auth AppRegistration unless GRAPH_TENANT_ID + environment variable is set. + +.PARAMETER ClientId + Azure AD app registration client ID (GUID). Required for -Auth AppRegistration unless + GRAPH_CLIENT_ID environment variable is set. + +.PARAMETER ClientSecret + Client secret value for app registration authentication. You can pass it directly, + convert from a secure string, or set it through GRAPH_CLIENT_SECRET. + +.PARAMETER ClientCertificateThumbprint + Thumbprint of a certificate located in the CurrentUser or LocalMachine "My" store. + Used when -Auth AppRegistration should authenticate with a certificate instead of a + client secret. Optional environment variable: GRAPH_CLIENT_CERT_THUMBPRINT. + +.PARAMETER ClientCertificateStoreLocation + Certificate store to search when using ClientCertificateThumbprint. Valid values: + CurrentUser (default) or LocalMachine. + +.PARAMETER ClientCertificatePath + Path to a PFX file containing the certificate for app registration auth. Optional + environment variable: GRAPH_CLIENT_CERT_PATH. + +.PARAMETER ClientCertificatePassword + Password for the PFX file specified by ClientCertificatePath. Accepts secure string or + plain text (converted internally). Optional environment variable: + GRAPH_CLIENT_CERT_PASSWORD. + +.PARAMETER BlockHours + Time window size in hours for each audit log query block. + Range: 0.016667 to 24 hours. Default: 0.5 (30 minutes) + +.PARAMETER PartitionHours + Time partition size in hours for Graph API parallel processing. + Range: 1-72. 
Default: 0 (auto-calculated based on date range and MaxPartitions) + +.PARAMETER MaxPartitions + Maximum number of time partitions for parallel query execution. + Range: 1-1000. Default: 160 + +.PARAMETER ResultSize + Maximum records returned per Search-UnifiedAuditLog query. + Range: 1 to 10000. Default: 10000 + +.PARAMETER PacingMs + Delay in milliseconds between audit log queries for throttling control. + Range: 0 to 10000. Default: 0 + +.PARAMETER ActivityTypes + Array of Purview audit activity types to query. + Default: @('CopilotInteraction') + Examples: CopilotInteraction, ConnectedAIAppInteraction, AIInteraction, AIAppInteraction + Note: CopilotInteraction captures ALL Microsoft 365 Copilot usage including Teams meetings (AppHost="Teams") + +.PARAMETER RecordTypes + Filter audit records by record type (Graph API mode only). + Accepts one or more record type names that map to Microsoft 365 workload categories. + Examples: SharePointFileOperation, ExchangeItem, MicrosoftTeams, AzureActiveDirectory + + BEHAVIOR: + • Standard mode: Filters server-side to specified record types + • With -IncludeM365Usage: Merged with curated M365 usage bundle record types + + LIMITATIONS: + • Graph API mode only (not supported with -UseEOM) + • Replay mode (-RAWInputCSV): Not supported (server-side filter only) + + Use with -ActivityTypes and -ServiceTypes for precise workload targeting. + +.PARAMETER ServiceTypes + Filter audit records by service/workload (Graph API mode only). + Accepts one or more service names representing Microsoft 365 workloads. 
+ Examples: Exchange, SharePoint, OneDrive, MicrosoftTeams, AzureActiveDirectory + + BEHAVIOR: + • Standard mode: Filters server-side to specified services + • Note: Multiple services may cause separate queries per service + + CRITICAL LIMITATIONS: + • Graph API mode only (not supported with -UseEOM) + • Replay mode (-RAWInputCSV): Not supported (server-side filter only) + • IGNORED when -IncludeM365Usage is active: The M365 usage bundle intentionally + sets ServiceTypes to null for optimal single-pass query performance. + Your -ServiceTypes values will be silently overridden. + + RECOMMENDATION: For M365 usage scenarios, use -IncludeM365Usage without -ServiceTypes. + For targeted workload queries without M365 bundle, use -ServiceTypes with -ActivityTypes. + +.PARAMETER ExplodeArrays + Enable Purview array explosion to canonical 35-column schema. + Explodes Messages[], Resources[], and Contexts[] arrays into separate rows. + +.PARAMETER ExplodeDeep + Enable deep flattening with Purview array explosion. + Produces 35 fixed columns + appended deep-flattened CopilotEventData.* columns. + +.PARAMETER RAWInputCSV + Path to previously exported raw Purview audit CSV for offline replay. + Must contain AuditData JSON column. Bypasses live Search-UnifiedAuditLog queries. + Forces array explosion even if -ExplodeArrays not specified. + +.PARAMETER MaxConcurrency + Maximum concurrent queries/partitions (1-10). + - EOM mode: limits concurrent serial queries + - Graph API mode: limits concurrent partition execution + Default: 10 (Microsoft Purview enforces a 10 concurrent search job limit per user account) + +.PARAMETER EnableParallel + Force enable parallel processing (overrides ParallelMode setting). + +.PARAMETER MaxParallelGroups + Maximum activity groups processed concurrently in parallel mode. + Range: 0 to 50. Default: 4 + +.PARAMETER ParallelMode + Parallel processing mode. 
Options: Off, On, Auto (default) + Auto: Enables parallel for PowerShell 7+ environments automatically. + +.PARAMETER ExplosionThreads + Number of threads for parallel explosion processing (post-retrieval phase). + 0 (default): Auto-detect based on CPU cores (2 to 8 threads) + 1: Force serial processing (disable parallel explosion) + 2-8: Explicit thread count (capped at 8 for stability) + Requires PowerShell 7+. Falls back to serial on PS5. + +.PARAMETER DisableAdaptive + Disable adaptive safeguards (memory/latency/concurrency smoothing). + +.PARAMETER ProgressSmoothingAlpha + Weight for smoothing dynamic progress total recalculation. + Range: 0.0 to 1.0. Default: 0.3 (0 = off) + +.PARAMETER HighLatencyMs + Partition average latency threshold (ms) triggering concurrency reduction. + Range: 1000 to 600000. Default: 90000 + +.PARAMETER MemoryPressureMB + Working set (MB) threshold to trigger concurrency reduction. + Range: 256 to 32768. Default: 1500 + +.PARAMETER MaxMemoryMB + Maximum process memory (MB) before flushing in-memory records to disk. + When exceeded, clears $allLogs after confirmed JSONL save to free memory. + Not compatible with -ExplodeDeep or -ExplodeArrays (ignored when explosion enabled). + Range: -1 to 65536. Default: -1 (auto = 75% of system RAM). Use 0 to disable. + +.PARAMETER StatusIntervalSeconds + How often (in seconds) to display a status update during job polling and backpressure waits. + Reduce to see more frequent progress output; increase to reduce console noise on long runs. + Range: 30 to 600. Default: 60 + +.PARAMETER LowLatencyMs + Sustained low latency threshold to consider concurrency step-up. + Range: 100 to 600000. Default: 20000 + +.PARAMETER LowLatencyConsecutive + Required consecutive low-latency groups before step-up. + Range: 1 to 10. Default: 2 + +.PARAMETER ThroughputDropPct + Percentage drop vs baseline required (with high latency) to justify reduction. + Range: 1 to 100. 
Default: 15 + +.PARAMETER ThroughputSmoothingAlpha + EMA smoothing for throughput baseline. + Range: 0.0 to 1.0. Default: 0.3 + +.PARAMETER AdaptiveConcurrencyCeiling + Upper bound for adaptive concurrency step-ups. + Range: 1 to 50. Default: 6 + +.PARAMETER ExportProgressInterval + Frequency of progress updates during export phase. + Range: 1 to 10000. Default: 10 + +.PARAMETER StreamingSchemaSample + Number of rows sampled before freezing CSV schema in streaming mode (SERIAL MODE ONLY). + Range: 100 to 50000. Default: 5000 + Higher values capture more columns but delay schema freeze. + NOTE: In parallel mode (PS7+), full schema discovery is used instead, scanning ALL rows + for 100% column coverage. This parameter only affects serial mode processing. + +.PARAMETER StreamingChunkSize + Number of rows per write batch in streaming CSV export. + Range: 100 to 50000. Default: 5000 + Lower values reduce memory pressure, higher values improve throughput. + +.PARAMETER AgentId + Filter to records matching specific AgentId value(s). + Example: -AgentId "CopilotStudio.Declarative.abc123" + +.PARAMETER AgentsOnly + Filter to records with any AgentId present (mutually exclusive with -ExcludeAgents). + +.PARAMETER PromptFilter + Filter messages by isPrompt property. + Options: Prompt (True), Response (False), Both (True/False), Null (undefined) + +.PARAMETER CircuitBreakerThreshold + Consecutive block failures before opening circuit breaker. + Range: 1 to 50. Default: 5 + +.PARAMETER CircuitBreakerCooldownSeconds + Cooldown duration (seconds) after circuit breaker trips. + Range: 5 to 3600. Default: 120 + +.PARAMETER BackoffBaseSeconds + Base seconds for exponential backoff between block retries. + Range: 0.1 to 120. Default: 1.0 + +.PARAMETER BackoffMaxSeconds + Maximum cap for exponential backoff delay (seconds). + Range: 1 to 600. Default: 45 + +.PARAMETER ExcludeAgents + Filter to records WITHOUT AgentId (mutually exclusive with -AgentId/-AgentsOnly). 
+ +.PARAMETER UserIds + Filter to specific user identifier(s). + LIVE MODE: Server-side filtering at Purview (efficient). + REPLAY MODE: Client-side filtering from AuditData JSON (slower). + Accepts UPN, SMTP address, or user GUID. + +.PARAMETER GroupNames + Filter to members of distribution/security group(s). + LIVE MODE ONLY: Groups automatically expanded after authentication. + REPLAY MODE: NOT SUPPORTED (requires authentication). + +.PARAMETER Help + Display script help information. + +.PARAMETER EmitMetricsJson + Emit structured metrics JSON alongside output file. + Default filename: <OutputFile>.metrics.json + +.PARAMETER MetricsPath + Override metrics output path. Requires -EmitMetricsJson. + +.PARAMETER AutoCompleteness + Aggressively subdivide windows that hit the server-side 10K record limit until results are complete or the minimum window is reached. + +.PARAMETER IncludeTelemetry + Export execution telemetry CSV alongside audit data (Graph API mode only). + Creates a separate CSV file with one row per partition containing timing and performance metrics. + Useful for analyzing query execution patterns, identifying bottlenecks, and capacity planning. + File naming: <OutputFile>_telemetry_<timestamp>.csv + Not available in EOM mode or OnlyUserInfo mode. + +.PARAMETER IncludeCopilotInteraction + Adds CopilotInteraction to the activity list even when you provide a custom -ActivityTypes array. + Useful when combining Copilot telemetry with targeted classic workloads without redefining defaults. + +.PARAMETER IncludeM365Usage + Adds a curated Microsoft 365 usage bundle spanning Exchange, SharePoint, OneDrive, Teams, + Forms, Stream, Planner, PowerApps, and Office desktop apps. 
+ + ACTIVITY TYPES INCLUDED: + Exchange: MailItemsAccessed, Send, SendOnBehalf, SoftDelete, HardDelete, + MoveToDeletedItems, CopyToFolder + SharePoint/OneDrive (Files): FileAccessed, FileDownloaded, FileUploaded, FileModified, + FileDeleted, FileMoved, FileCheckedIn, FileCheckedOut, FileRecycled, FileRestored, + FileVersionsAllDeleted + SharePoint/OneDrive (Sharing): SharingInvitationCreated, SharingInvitationAccepted, + SharedLinkCreated, SharingRevoked, RemovedFromSecureLink + Groups: AddMemberToUnifiedGroup, RemoveMemberFromUnifiedGroup + Teams (Team/Channel): TeamCreated, TeamDeleted, TeamArchived, TeamSettingChanged, + TeamMemberAdded, TeamMemberRemoved, MemberAdded, MemberRemoved, MemberRoleChanged, + ChannelAdded, ChannelDeleted, ChannelSettingChanged, ChannelOwnerResponded, + ChannelMessageSent, ChannelMessageDeleted, BotAddedToTeam, BotRemovedFromTeam, + TabAdded, TabRemoved, TabUpdated, ConnectorAdded, ConnectorRemoved, ConnectorUpdated + Teams (Chat/Messaging): TeamsSessionStarted, ChatCreated, ChatRetrieved, ChatUpdated, + MessageSent, MessageRead, MessageDeleted, MessageUpdated, MessagesListed, + MessageCreation, MessageCreatedHasLink, MessageEditedHasLink, + MessageHostedContentRead, MessageHostedContentsListed, SensitiveContentShared + Teams (Meetings): MeetingCreated, MeetingUpdated, MeetingDeleted, MeetingStarted, MeetingEnded, + MeetingParticipantJoined, MeetingParticipantLeft, MeetingParticipantRoleChanged, + MeetingRecordingStarted, MeetingRecordingEnded, MeetingDetail, MeetingParticipantDetail, + LiveNotesUpdate, AINotesUpdate, RecordingExported, TranscriptsExported + Teams (Apps/Approvals): AppInstalled, AppUpgraded, AppUninstalled, CreatedApproval, + ApprovedRequest, RejectedApprovalRequest, CanceledApprovalRequest + Office Apps: Create, Edit, Open, Save, Print + Forms: CreateForm, EditForm, DeleteForm, ViewForm, CreateResponse, SubmitResponse, + ViewResponse, DeleteResponse + Stream: StreamModified, StreamViewed, StreamDeleted, 
StreamDownloaded + Planner: PlanCreated, PlanDeleted, PlanModified, TaskCreated, TaskDeleted, TaskModified, + TaskAssigned, TaskCompleted + PowerApps: LaunchedApp, CreatedApp, EditedApp, DeletedApp, PublishedApp + Copilot: CopilotInteraction + + RECORD TYPES INCLUDED: + ExchangeAdmin, ExchangeItem, ExchangeMailbox, SharePointFileOperation, + SharePointSharingOperation, SharePoint, OneDrive, MicrosoftTeams, OfficeNative, + MicrosoftForms, MicrosoftStream, PlannerPlan, PlannerTask, PowerAppsApp + + IMPORTANT: When this switch is active, -ServiceTypes parameter is ignored and set to null. + This ensures optimal single-pass query performance across all Microsoft 365 workloads. + +.PARAMETER IncludeDSPMForAI + Include DSPM for AI activity types: ConnectedAIAppInteraction, AIInteraction, AIAppInteraction. + Note: Some activity types may trigger PAYG billing. See billing information prompt for details. + +.PARAMETER ExcludeCopilotInteraction + Exclude Microsoft 365 Copilot activity type (CopilotInteraction). + Overrides custom list and default behavior. Use with -IncludeDSPMForAI to query only DSPM activity types. + +.PARAMETER ExportWorkbook + Export data to Excel workbook (.xlsx) instead of CSV files. + Without -CombineOutput: Creates multi-tab workbook with one tab per activity type. + With -CombineOutput: Creates single-tab workbook with all data combined. + Tab formatting: AutoSize columns, freeze top row, bold headers, preserve leading zeros. + Requires ImportExcel module (auto-installs if missing). + File naming includes "DSPM" suffix only if DSPM features enabled. + +.PARAMETER AppendFile + Append activity data to an existing output file instead of creating a new timestamped file. + Accepts either a filename (combined with -OutputPath) or a full path to the existing file. 
+ + **Filename Resolution:** + • Relative filename: -AppendFile "MyReport.csv" → Uses -OutputPath directory + • Full path: -AppendFile "C:\Data\Report.xlsx" → Uses exact path specified + + **Requirements:** + • File must already exist (create it first without -AppendFile) + • File extension must match export mode (.csv without -ExportWorkbook, .xlsx with -ExportWorkbook) + • Cannot be used with -IncludeUserInfo or -OnlyUserInfo (EntraUsers data is never appended) + • Requires single-file output mode (see Single-File Output Requirements below) + + **Single-File Output Requirements:** + Must use ONE of these modes to ensure single output file: + 1. Excel mode: -ExportWorkbook (all activity types in one .xlsx with multiple tabs) + 2. Combined CSV: -CombineOutput (all activity types merged into one .csv file) + 3. Single activity type: -ActivityTypes CopilotInteraction (only one activity type selected) + + **CSV Mode Behavior:** + • Appends rows to existing CSV file + • Validates headers match exactly (case-sensitive column names and order) + • If headers mismatch: Script exits with error showing column differences + • Compatible with: Standard (1:1), -ExplodeArrays, -ExplodeDeep modes + + **Excel Mode Behavior:** + • Requires -ExportWorkbook parameter + • Validates column headers match existing tabs + • If headers match: Appends new rows to existing tabs + • If headers mismatch: Creates timestamped duplicate tabs (preserves both datasets) + • Compatible with both multi-tab and -CombineOutput modes + + **EntraUsers Export Restrictions:** + • Cannot use -AppendFile with -IncludeUserInfo or -OnlyUserInfo + • EntraUsers data represents a point-in-time snapshot, not time-based activity data + • Each EntraUsers export should create a fresh dataset with current user information + + **Always Timestamped (Never Overwritten):** + Even when using -AppendFile, these files are always timestamped: + • Log files: *_.log + • Telemetry files: *_telemetry_.csv + • Metrics files: 
*_metrics_.json (unless -MetricsPath specified) + + **Error Scenarios:** + • File not found: Script exits (create initial file first without -AppendFile) + • CSV header mismatch: Script exits with detailed column differences + • Excel without -ExportWorkbook: Script exits with error + • Multiple output files would be created: Script exits with single-file output requirement + • EntraUsers modes: Script exits (append not supported for snapshot data) + +.PARAMETER CombineOutput + Combines all activity types into a single output file or tab. + + **Default Behavior (without -CombineOutput):** + • CSV: One separate CSV file per activity type (plus EntraUsers_MAClicensing_.csv if -IncludeUserInfo) + • Excel: Multi-tab workbook (one tab per activity type; EntraUsers tab appended last if -IncludeUserInfo) + + **With -CombineOutput switch:** + • CSV: Single combined activity file named Purview_Audit_CombinedUsageActivity_.csv (plus separate EntraUsers_MAClicensing_.csv if -IncludeUserInfo) + • Excel: First tab named CombinedUsageActivity (no timestamp) with all activity rows; separate EntraUsers tab if -IncludeUserInfo + + Entra user/org data is never merged into the combined activity dataset—always exported separately. + + Recommended: Use combined mode for ingestion pipelines; separated mode for granular analysis. + +.PARAMETER Force + Force execution without interactive prompts. Automatically accepts defaults for: + 1. DSPM for AI Billing Information: Automatically continues when -IncludeDSPMForAI is enabled + 2. Conflict Resolution (ExcludeCopilotInteraction): Automatically honors -ExcludeCopilotInteraction when conflict with -ActivityTypes + Use this switch for unattended/automated executions (CI/CD pipelines, scheduled tasks). + +.PARAMETER SkipDiagnostics + Skip pre-query capability diagnostics (advanced). + +.PARAMETER UseEOM + Use Exchange Online Management mode with Search-UnifiedAuditLog cmdlet. 
+ When specified, queries Purview audit logs via EOM PowerShell module (serial processing only). + Default mode (when omitted) uses Microsoft Graph Security API with parallel processing support. + + PERMISSIONS REQUIRED (EOM mode): + • Exchange Online RBAC Roles: + - View-Only Audit Logs role + - Compliance Management role group + - Organization Management role group + - Or custom role with Search-UnifiedAuditLog cmdlet permission + + PARALLEL PROCESSING: + EOM mode is SERIAL-ONLY. Parallel processing is automatically disabled. + If -EnableParallel or -ParallelMode is specified with -UseEOM, script will exit with error. + + +.PARAMETER IncludeUserInfo + Include Entra (Microsoft Entra ID) user directory & Copilot license enrichment (Graph API mode only). + Exports an independent EntraUsers file/tab with: + • Core identity & profile details (name, UPN, job, department, location, organization info) + • Account & sync state (accountEnabled, onPremSync attributes, creation / change stamps) + • Manager expansion (identity + basic role/job fields via $expand=manager) + • Contact & routing (mail, proxyAddresses flattened, preferredLanguage) + • License enrichment (assignedLicenses list + hasLicense boolean for Copilot detection) + + Separation Principle: EntraUsers data is never merged into activity rows; always a distinct artifact. + + Requirements: + • Graph API mode (not supported with -UseEOM) + • Graph permissions: User.Read.All, Organization.Read.All + • One-time directory + license fetch at startup (typ. +10–20s) + + License Detection Logic: + 1. Match known Copilot SKU IDs (curated list) + 2. Fallback name pattern search containing "Copilot" for future SKUs + + Performance: Single batched fetch + hashtable lookups; no per-record calls. + + Use Cases: + • License compliance & adoption + • Mapping usage to directory attributes + • Identifying unlicensed usage patterns + + Not available in EOM mode. 
+
+.PARAMETER OnlyUserInfo
+    Export ONLY Entra user directory and license information (skips all audit log retrieval).
+    This is a specialized mode for quickly exporting user licensing data without querying audit logs.
+
+    BEHAVIOR:
+      • Authenticates to Microsoft Graph
+      • Fetches Entra user directory and license data
+      • Exports standalone EntraUsers_MAClicensing_<timestamp>.csv file
+      • Skips all audit log queries (completes in 5-15 seconds vs. minutes/hours)
+      • Automatically enables -IncludeUserInfo
+
+    OUTPUT:
+      Single CSV file: EntraUsers_MAClicensing_YYYYMMDD_HHMMSS.csv
+      Contains 37 columns including:
+        - Identity fields (UPN, displayName, id, mail)
+        - Profile data (jobTitle, department, officeLocation)
+        - Manager hierarchy (manager info expanded)
+        - License assignments (assignedLicenses + hasCopilotLicense boolean)
+
+    COMPATIBLE PARAMETERS (can be used WITH -OnlyUserInfo):
+      • -OutputPath : Specify output directory
+      • -Auth : Choose authentication method (WebLogin, DeviceCode, etc.)
+      • -ExportWorkbook : Export to Excel instead of CSV
+      • -Force : Bypass interactive prompts
+      • -MaxNetworkOutageMinutes : Network resilience for Graph API calls
+      • -EmitMetricsJson, -MetricsPath : Track Entra retrieval metrics / custom metrics location
+      • -SkipDiagnostics : Skip pre-query capability checks
+      • -Debug / -Verbose : Enable diagnostic output
+
+    INCOMPATIBLE PARAMETERS (cannot be used with -OnlyUserInfo):
+      Audit Retrieval:
+      • StartDate, EndDate : No audit queries to filter
+      • ActivityTypes : Cleared by -OnlyUserInfo
+      • IncludeM365Usage : Activity type modifier
+      • IncludeDSPMForAI : Activity type modifier
+      • ExcludeCopilotInteraction: Activity type modifier
+      • BlockHours, PartitionHours, MaxPartitions, ResultSize, PacingMs
+      • AutoCompleteness : Audit log completeness checks
+      • StreamingSchemaSample, StreamingChunkSize, ExportProgressInterval
+      • DisableAdaptive : Adaptive concurrency controls apply only to audit queries
+
+      Output:
+      • CombineOutput : Only relevant with multiple activity types
+      • AppendFile : Appending user-only data to existing audit output is not supported
+
+      Filtering:
+      • UserIds, GroupNames : User filtering requires audit logs
+      • RecordTypes, ServiceTypes : Audit record filtering
+      • AgentId, AgentsOnly, ExcludeAgents: Agent filtering requires audit logs
+      • PromptFilter : Message filtering requires audit logs
+
+      Processing:
+      • ExplodeArrays, ExplodeDeep: Explosion requires audit records
+      •
RAWInputCSV : Replay mode exports audit data + + Parallelization: + • ParallelMode, MaxParallelGroups, MaxConcurrency, EnableParallel + • MaxActivePartitions : Query execution settings + + EOM Mode: + • UseEOM : Exchange Online Management mode + + USE CASES: + 1. License compliance auditing (quick snapshot of all user licenses) + 2. Periodic license data exports for tracking/trending + 3. Standalone user directory exports for cross-referencing + 4. Rapid licensing status checks without audit log overhead + + EXAMPLES: + # Basic user-only export + .\PAX_Purview_Audit_Log_Processor.ps1 -OnlyUserInfo + + # Export to specific directory + .\PAX_Purview_Audit_Log_Processor.ps1 -OnlyUserInfo -OutputPath "D:\UserData\" + .EXAMPLE + # Curated Microsoft 365 usage bundle (adds activity, record, and service filters automatically) + pwsh -File .\PAX_Purview_Audit_Log_Processor.ps1 -StartDate 2025-11-01 -EndDate 2025-11-02 -IncludeM365Usage -CombineOutput -OutputFile C:\Temp\M365Usage.csv + + # Export as Excel workbook + .\PAX_Purview_Audit_Log_Processor.ps1 -OnlyUserInfo -ExportWorkbook + + # Use device code auth (for automation/headless scenarios) + .\PAX_Purview_Audit_Log_Processor.ps1 -OnlyUserInfo -Auth DeviceCode + + # App registration auth (client secret) + .\PAX_Purview_Audit_Log_Processor.ps1 -Auth AppRegistration -TenantId "" -ClientId "" -ClientSecret (ConvertTo-SecureString "" -AsPlainText -Force) + + # App registration auth (certificate thumbprint) + .\PAX_Purview_Audit_Log_Processor.ps1 -Auth AppRegistration -TenantId "" -ClientId "" -ClientCertificateThumbprint "" + + PERFORMANCE: + Typical execution time: 5-15 seconds (vs. minutes/hours for audit log queries) + Network traffic: Minimal (only user directory + license API calls) + + NOT AVAILABLE IN EOM MODE: Requires Microsoft Graph API (user directory/licenses not in EOM). 
+ +.PARAMETER MaxNetworkOutageMinutes + Maximum continuous network outage the script will tolerate during audit log operations (query creation, polling, record retrieval). + Applies to transient network errors: 502 Bad Gateway, 503 Service Unavailable, 504 Gateway Timeout, connection failures. + Script automatically retries failed operations with randomized delays (30-60s) until connectivity is restored or timeout is exceeded. + Clean terminal output shows error summaries with countdown timers; full error details logged to file for troubleshooting. + Progress is preserved - no data loss during network interruptions. + Exceeding this window aborts with clear error message indicating tolerance exceeded. + Default: 30 minutes (adjustable 1-120) + +.PARAMETER Resume + Resume an interrupted operation from a checkpoint file. + Checkpoint files are automatically created during all auth modes to allow resumption + after Ctrl+C, network failures, token expiry, or any interruption. + + IMPORTANT: Resume mode is STANDALONE. + All processing parameters are restored from the checkpoint file. + You cannot specify other parameters with -Resume (except auth overrides). + + USAGE: + -Resume Auto-discover checkpoint in current directory/OutputPath + -Resume "path\to\file" Use specific checkpoint file + + ALLOWED WITH -Resume: + -Force Use most recent checkpoint without prompting + -Auth Override authentication method + -TenantId, -ClientId Override auth credentials (for AppRegistration) + -ClientSecret Provide client secret (for AppRegistration) + + NOT ALLOWED WITH -Resume: + Any other parameter (dates, activities, explosion settings, etc.) + These are all restored from the checkpoint to ensure data consistency. 
+
+    CHECKPOINT LOCATION:
+      Files are created in OutputPath with pattern: .pax_checkpoint_.json
+
+#>
+
+# Script parameter block. Every parameter is optional; defaults are declared inline.
+# The block continues past $IncludeUserInfo with -OnlyUserInfo, -MaxNetworkOutageMinutes,
+# -IncludeTelemetry and the catch-all $RemainingArgs (used for manual -Resume parsing).
+param(
+    [Parameter(Mandatory = $false)]
+    [string]$StartDate,  # Live mode: if omitted (with EndDate) auto-populated later; Replay: optional filter
+
+    [Parameter(Mandatory = $false)]
+    [string]$EndDate,    # Live mode: if omitted (with StartDate) auto-populated; Replay: optional filter
+
+    [Parameter(Mandatory = $false)]
+    [string]$OutputPath = "C:\Temp\",
+
+
+    [Parameter(Mandatory = $false)]
+    [ValidateSet('WebLogin', 'DeviceCode', 'Credential', 'Silent', 'AppRegistration')]
+    [string]$Auth = 'WebLogin',
+
+    [Parameter(Mandatory = $false)]
+    [string]$TenantId,
+
+    [Parameter(Mandatory = $false)]
+    [string]$ClientId,
+
+    # NOTE(review): declared as [string], but the help example passes
+    # (ConvertTo-SecureString ... -AsPlainText -Force); a SecureString bound to a [string]
+    # parameter is converted to its type name rather than the secret - confirm which form is intended.
+    [Parameter(Mandatory = $false)]
+    [string]$ClientSecret,
+
+    [Parameter(Mandatory = $false)]
+    [string]$ClientCertificateThumbprint,
+
+    [Parameter(Mandatory = $false)]
+    [ValidateSet('CurrentUser','LocalMachine')]
+    [string]$ClientCertificateStoreLocation = 'CurrentUser',
+
+    [Parameter(Mandatory = $false)]
+    [string]$ClientCertificatePath,
+
+    [Parameter(Mandatory = $false)]
+    [System.Security.SecureString]$ClientCertificatePassword,
+
+    [Parameter(Mandatory = $false)]
+    [ValidateRange(0.016667, 24)]
+    [double]$BlockHours = 0.5,
+
+    # NOTE(review): the default (0) lies outside the declared ValidateRange(1, 72).
+    # Presumably 0 is an "unset/auto" sentinel; confirm no later assignment of 0 to this
+    # variable is expected to succeed, since the attribute stays attached to the variable.
+    [Parameter(Mandatory = $false)]
+    [ValidateRange(1, 72)]
+    [int]$PartitionHours = 0,
+
+    [Parameter(Mandatory = $false)]
+    [ValidateRange(1, 1000)]
+    [int]$MaxPartitions = 160,
+
+    [Parameter(Mandatory = $false)]
+    [ValidateRange(1, 10000)]
+    [int]$ResultSize = 10000,
+
+    [Parameter(Mandatory = $false)]
+    [ValidateRange(0, 10000)]
+    [int]$PacingMs = 0,
+
+    [Parameter(Mandatory = $false)]
+    [string[]]$ActivityTypes = @('CopilotInteraction'),
+
+    [Parameter(Mandatory = $false)]
+    [string[]]$RecordTypes,
+
+    [Parameter(Mandatory = $false)]
+    [string[]]$ServiceTypes,
+
+    [Parameter(Mandatory = $false)]
+    [switch]$ExplodeArrays,
+
+    [Parameter(Mandatory = $false)]
+    [switch]$ExplodeDeep,
+    [Parameter(Mandatory = $false)]
+    [int]$FlatDepth = 120,
+    # Offline replay of a previously downloaded raw Purview audit CSV (bypasses live Search-UnifiedAuditLog)
+    [Parameter(Mandatory = $false)]
+    [string]$RAWInputCSV,
+    [Parameter(Mandatory = $false)]
+    # Controls concurrent execution: EOM mode limits serial queries, Graph API mode limits partition parallelism
+    [int]$MaxConcurrency = 10,
+    [Parameter(Mandatory = $false)]
+    [switch]$EnableParallel,
+    [Parameter(Mandatory = $false)]
+    [ValidateRange(0, 50)]
+    # Allows multiple activity groups to be processed concurrently (aligns with Microsoft's ~10 query safe limit)
+    [int]$MaxParallelGroups = 8,
+    [Parameter(Mandatory = $false)]
+    [ValidateSet('Off', 'On', 'Auto')]
+    # Default now 'Auto' so that PS 7+ environments engage parallel processing automatically unless explicitly turned Off.
+    [string]$ParallelMode = 'Auto',
+    [Parameter(Mandatory = $false)]
+    [ValidateRange(0, 32)]
+    # 0=auto-detect (2-16 threads based on CPU), 1=serial, 2-32=explicit thread count. Requires PS7+.
+    [int]$ExplosionThreads = 0,
+    [Parameter(Mandatory = $false)]
+    [switch]$DisableAdaptive,               # Disable adaptive safeguards (memory/latency/concurrency smoothing)
+    [Parameter(Mandatory = $false)]
+    [ValidateRange(0.0,1.0)]
+    [double]$ProgressSmoothingAlpha = 0.3,  # Weight for smoothing dynamic progress total recalculation (0 => off)
+    [Parameter(Mandatory = $false)]
+    [ValidateRange(1000,600000)]
+    [int]$HighLatencyMs = 90000,            # Partition average latency threshold (ms) triggering mild concurrency reduction
+    [Parameter(Mandatory = $false)]
+    [ValidateRange(256,32768)]
+    [int]$MemoryPressureMB = 1500,          # Working set (MB) threshold to trigger mild concurrency reduction
+    [Parameter(Mandatory = $false)]
+    [ValidateRange(-1,65536)]
+    [int]$MaxMemoryMB = -1,                 # Max process memory (MB) before flushing $allLogs to disk (-1 = auto 75%, 0 = disabled)
+    [Parameter(Mandatory = $false)]
+    [ValidateRange(30,600)]
+    [int]$StatusIntervalSeconds = 60,       # How often (seconds) to display status during polling and backpressure waits
+    [Parameter(Mandatory = $false)]
+    [ValidateRange(100,600000)]
+    [int]$LowLatencyMs = 20000,             # Sustained low latency threshold to consider concurrency step-up
+    [Parameter(Mandatory = $false)]
+    [ValidateRange(1,10)]
+    [int]$LowLatencyConsecutive = 2,        # Required consecutive low-latency groups before step-up
+    [Parameter(Mandatory = $false)]
+    [ValidateRange(1,100)]
+    [int]$ThroughputDropPct = 15,           # % drop vs baseline required (with high latency) to justify reduction
+    [Parameter(Mandatory = $false)]
+    [ValidateRange(0.0,1.0)]
+    [double]$ThroughputSmoothingAlpha = 0.3,# EMA smoothing for throughput baseline
+    [Parameter(Mandatory = $false)]
+    [ValidateRange(1,50)]
+    [int]$AdaptiveConcurrencyCeiling = 6,   # Upper bound for adaptive step-ups
+    [Parameter(Mandatory = $false)]
+    [ValidateRange(1, 10000)]
+    [int]$ExportProgressInterval = 10,
+
+    # Streaming export is always-on
+    [Parameter(Mandatory = $false)]
+    [ValidateRange(100, 50000)]
+    [int]$StreamingSchemaSample = 5000,
+
+    [Parameter(Mandatory = $false)]
+    [ValidateRange(100, 50000)]
+    [int]$StreamingChunkSize = 5000,
+
+    [Parameter(Mandatory = $false)]
+    [string[]]$AgentId,
+
+    [Parameter(Mandatory = $false)]
+    [switch]$AgentsOnly,
+
+    [Parameter(Mandatory = $false)]
+    [ValidateSet('Prompt', 'Response', 'Both', 'Null')]
+    [string]$PromptFilter,
+
+    # --- Reliability Enhancements (Backoff & Circuit Breaker) ---
+    [Parameter(Mandatory = $false)]
+    [ValidateRange(1,50)]
+    [int]$CircuitBreakerThreshold = 5,            # Consecutive block failures before opening circuit breaker
+    [Parameter(Mandatory = $false)]
+    [ValidateRange(5,3600)]
+    [int]$CircuitBreakerCooldownSeconds = 120,    # Cooldown duration after breaker trips
+    [Parameter(Mandatory = $false)]
+    [ValidateRange(0.1,120)]
+    [double]$BackoffBaseSeconds = 1.0,            # Base seconds for exponential backoff between block retries
+    [Parameter(Mandatory = $false)]
+    [ValidateRange(1,600)]
+    [int]$BackoffMaxSeconds = 45,                 # Max cap for exponential backoff delay
+
+    [Parameter(Mandatory = $false)]
+    [switch]$ExcludeAgents,
+
+    [Parameter(Mandatory = $false)]
+    [string[]]$UserIds,
+
+    [Parameter(Mandatory = $false)]
+    [string[]]$GroupNames,
+
+    [Parameter(Mandatory = $false)]
+    [switch]$Help,
+
+    # Emit structured metrics JSON alongside CSV (OutputFile name with .metrics.json)
+    # NOTE(review): no -OutputFile parameter is declared in this param block; confirm which
+    # path the "OutputFile name" wording refers to.
+    [Parameter(Mandatory = $false)]
+    [switch]$EmitMetricsJson,
+
+    # Override metrics output path (optional). If provided and -EmitMetricsJson specified, writes here instead of OutputFile substitution.
+    [Parameter(Mandatory = $false)]
+    [string]$MetricsPath,
+
+    # Ensure completeness: aggressively subdivide any window still returning server 10K limit until below threshold or min window reached.
+    [Parameter(Mandatory = $false)]
+    [switch]$AutoCompleteness,
+
+    # Explicitly request the CopilotInteraction activity type in addition to any other selection
+    [Parameter(Mandatory = $false)]
+    [switch]$IncludeCopilotInteraction,
+
+    # Add the curated Microsoft 365 usage activity bundle
+    [Parameter(Mandatory = $false)]
+    [switch]$IncludeM365Usage,
+
+    # DSPM for AI: Include DSPM activity types (ConnectedAIAppInteraction, AIInteraction, AIAppInteraction)
+    [Parameter(Mandatory = $false)]
+    [switch]$IncludeDSPMForAI,
+
+    # DSPM for AI: Exclude CopilotInteraction activity type (overrides custom list and default fallback)
+    [Parameter(Mandatory = $false)]
+    [switch]$ExcludeCopilotInteraction,
+
+    # Excel Export: Export data to Excel workbook (.xlsx) instead of CSV files
+    [Parameter(Mandatory = $false)]
+    [switch]$ExportWorkbook,
+
+    # Append data to existing file (CSV or Excel - requires -ExportWorkbook for Excel)
+    # Provide filename (e.g., "MyReport.xlsx") or full path (e.g., "C:\Data\Report.csv")
+    [Parameter(Mandatory = $false)]
+    [string]$AppendFile,
+
+    # Combine all activity types into single output file/tab (CSV or Excel)
+    # CSV default when omitted: separate files per activity type
+    # Excel default when omitted: separate tabs per activity type
+    # Use -CombineOutput switch to merge all activity types into one file/tab
+    [Parameter(Mandatory = $false)]
+    [switch]$CombineOutput,
+
+    # Force execution without interactive prompts (PAYG warning, conflict resolution)
+    [Parameter(Mandatory = $false)]
+    [switch]$Force,
+
+    # Skip pre-query capability diagnostics (advanced)
+    [Parameter(Mandatory = $false)]
+    [switch]$SkipDiagnostics,
+
+    # Use Exchange Online Management mode (Search-UnifiedAuditLog cmdlet, serial-only)
+    [Parameter(Mandatory = $false)]
+    [switch]$UseEOM,
+
+    # Include Entra user directory and license information in export (adds separate EntraUsers file/tab; column count may evolve)
+    [Parameter(Mandatory = $false)]
+    [switch]$IncludeUserInfo,
+
+    # Export only Entra user directory and license information (skips all audit log retrieval)
+    [Parameter(Mandatory = $false)]
+
[switch]$OnlyUserInfo, + + # Maximum minutes to tolerate continuous network outage during Graph async polling & record retrieval (adaptive backoff). Default 30. + [Parameter(Mandatory = $false)] + [int]$MaxNetworkOutageMinutes = 30, + + # Export Graph API telemetry CSV for partition timing analysis and troubleshooting + [Parameter(Mandatory = $false)] + [switch]$IncludeTelemetry, + + # Resume from checkpoint file - HANDLED VIA $args (not param block) to support: + # -Resume (auto-discover checkpoint in OutputPath) + # -Resume "path/to/file" (explicit checkpoint path) + # This parameter captures any remaining arguments for manual -Resume parsing + [Parameter(Mandatory = $false, ValueFromRemainingArguments = $true)] + [string[]]$RemainingArgs +) + +# DEBUG MARKER (removed to reduce noise) + +# ============================================================ +# MANUAL -Resume PARAMETER PARSING +# Enables: -Resume (auto-discover) and -Resume "path" (explicit) +# ============================================================ +$Resume = $null +$ResumeSpecified = $false +if ($RemainingArgs -and $RemainingArgs.Count -gt 0) { + for ($i = 0; $i -lt $RemainingArgs.Count; $i++) { + if ($RemainingArgs[$i] -eq '-Resume') { + $ResumeSpecified = $true + # Check if next argument exists and is not another parameter + if (($i + 1) -lt $RemainingArgs.Count -and $RemainingArgs[$i + 1] -notmatch '^-') { + $Resume = $RemainingArgs[$i + 1] + $i++ # Skip the path argument + } else { + $Resume = '' # Auto-discover mode + } + } + } +} + +# ============================================================ +# PROMOTE AUTH PARAMETERS TO SCRIPT SCOPE +# Enables access from within functions (e.g., Connect-PurviewAudit) +# ============================================================ +$script:TenantId = $TenantId +$script:ClientId = $ClientId +$script:ClientSecret = $ClientSecret +$script:ClientCertificateThumbprint = $ClientCertificateThumbprint +$script:ClientCertificateStoreLocation = 
 $ClientCertificateStoreLocation
+$script:ClientCertificatePath = $ClientCertificatePath
+$script:ClientCertificatePassword = $ClientCertificatePassword
+
+    # Builds and returns the final list of activity types to query.
+    # Returns $Values unchanged when it is null/empty; otherwise the list is assembled from
+    # -ActivityTypes, -IncludeDSPMForAI, -IncludeCopilotInteraction, -IncludeM365Usage and
+    # -ExcludeCopilotInteraction (Steps 1-7), then de-duplicated.
+    #
+    # NOTE(review): the name and the $Values parameter suggest a comma-splitting helper, but
+    # after the empty-input guard $Values is never read again - the body resolves activity
+    # types instead. Possibly two functions were merged; confirm against the intended source.
+    # NOTE(review): inside a function, $PSBoundParameters holds that function's own bound
+    # parameters (here only -Values), so the ContainsKey('ActivityTypes') checks would not see
+    # the script-level binding - verify.
+    # NOTE(review): the assignments to $RecordTypes / $ServiceTypes in Step 4 have no scope
+    # modifier, so within a function they create function-local variables - confirm that the
+    # script-level values are meant to change.
+    # Reads $copilotBaseActivityType, $m365UsageActivityBundle and $m365UsageRecordBundle,
+    # which are defined outside this block.
+    function Resolve-CommaSeparatedValues {
+        param([string[]]$Values)
+
+        if (-not $Values -or $Values.Count -eq 0) {
+            return $Values
+        }
+
+        $finalActivityTypes = @()
+
+        # Step 1: Add explicit -ActivityTypes parameter values (if provided and not default)
+        if ($PSBoundParameters.ContainsKey('ActivityTypes') -and $ActivityTypes) {
+            foreach ($actType in $ActivityTypes) {
+                if ($actType -and $actType -ne '') {
+                    $finalActivityTypes += $actType
+                }
+            }
+            if ($finalActivityTypes.Count -gt 0) {
+                Write-LogHost "Custom ActivityTypes provided: $($finalActivityTypes -join ', ')" -ForegroundColor Gray
+            }
+        }
+
+        # Step 2: Add DSPM for AI activity types if switch enabled
+        if ($IncludeDSPMForAI) {
+            $finalActivityTypes += 'ConnectedAIAppInteraction'
+            $finalActivityTypes += 'AIInteraction'
+            $finalActivityTypes += 'AIAppInteraction'
+            Write-LogHost "DSPM for AI: Adding ConnectedAIAppInteraction, AIInteraction, AIAppInteraction (See billing information for details)" -ForegroundColor Cyan
+        }
+
+        # Step 3: Add CopilotInteraction when explicitly requested
+        if ($IncludeCopilotInteraction -and -not ($finalActivityTypes -contains $copilotBaseActivityType)) {
+            $finalActivityTypes += $copilotBaseActivityType
+            Write-LogHost "IncludeCopilotInteraction: Adding $copilotBaseActivityType (explicit request)" -ForegroundColor Cyan
+        }
+
+        # Step 4: Add Microsoft 365 usage bundle when requested
+        Write-LogHost ("IncludeM365Usage switch present: {0}" -f $IncludeM365Usage.IsPresent) -ForegroundColor DarkGray
+        if ($IncludeM365Usage) {
+            $finalActivityTypes += $m365UsageActivityBundle
+            Write-LogHost ("M365 Usage bundle: Adding {0} activity types across Exchange/SharePoint/OneDrive/Teams" -f $m365UsageActivityBundle.Count) -ForegroundColor Cyan
+
+            # Merge any caller-supplied record types with the bundle's record types, dropping empties and duplicates
+            $RecordTypes = @(
+                if ($RecordTypes) { $RecordTypes }
+                $m365UsageRecordBundle
+            ) |
 Where-Object { $_ } | Select-Object -Unique
+            if ($RecordTypes.Count -eq 0) { $RecordTypes = $null }
+
+            # CRITICAL: Do NOT set ServiceTypes for M365 usage mode - Graph API should get ALL workloads in single pass
+            # Multiple serviceFilter values cause unnecessary workload splits (Exchange, SharePoint, OneDrive, Teams)
+            # Instead, send NO serviceFilter and let Graph API return all workloads in one query per partition
+            $ServiceTypes = $null
+
+            if ($RecordTypes) {
+                Write-LogHost "M365 Usage bundle: RecordTypes => $($RecordTypes -join ', ')" -ForegroundColor Gray
+            }
+            Write-LogHost "M365 Usage mode: ServiceTypes => NULL (single workload pass, all services combined)" -ForegroundColor Cyan
+        }
+
+        # Step 5: BASE ACTIVITY TYPE - Add CopilotInteraction as default base type
+        # This is the core Microsoft 365 Copilot activity type (FREE, included in M365 Copilot licensing)
+        # Captures ALL M365 Copilot usage including Teams meetings, Word, Excel, PowerPoint, Outlook, etc.
+        # Auto-add when:
+        #   1. User didn't explicitly provide -ActivityTypes parameter (default behavior), OR
+        #   2.
User specified any DSPM switch (implies Copilot context needed) + # Exception: Always respect -ExcludeCopilotInteraction (handled in Step 6) + $userProvidedCustomTypes = $PSBoundParameters.ContainsKey('ActivityTypes') + $userWantsDSPM = $IncludeDSPMForAI + if (-not $ExcludeCopilotInteraction) { + # Auto-add if no custom types provided OR if DSPM switches used (implies Copilot data needed) + if (-not $userProvidedCustomTypes -or $userWantsDSPM) { + # Add CopilotInteraction if not already present + if (-not ($finalActivityTypes -contains $copilotBaseActivityType)) { + $finalActivityTypes = @($copilotBaseActivityType) + $finalActivityTypes + } + } + } + + # Step 6: EXCLUSION OVERRIDE - Remove CopilotInteraction if -ExcludeCopilotInteraction is true + if ($ExcludeCopilotInteraction) { + $finalActivityTypes = $finalActivityTypes | Where-Object { $_ -ne $copilotBaseActivityType } + } + + # Step 7: Final deduplication and validation + $finalActivityTypes = @($finalActivityTypes | Select-Object -Unique) + + return $finalActivityTypes +} + +function Send-PromptNotification { + <# + .SYNOPSIS + Plays a system beep to alert user that a prompt requires attention. + .DESCRIPTION + Useful when user is working in other windows and needs to be notified + when a prompt appears that requires input. 
+ #> + + try { + # Play 3 short beeps to get attention + [Console]::Beep(800, 200) # 800Hz for 200ms + Start-Sleep -Milliseconds 100 + [Console]::Beep(1000, 200) # 1000Hz for 200ms + Start-Sleep -Milliseconds 100 + [Console]::Beep(1200, 300) # 1200Hz for 300ms (slightly longer final beep) + } + catch { + # Silently fail if beep not supported (e.g., some server environments) + } +} + +# Validate -OnlyUserInfo parameter compatibility +if ($OnlyUserInfo) { + $incompatibleParams = @() + + # Date filtering parameters + if ($PSBoundParameters.ContainsKey('StartDate')) { $incompatibleParams += " - StartDate (not applicable for user-only export)" } + if ($PSBoundParameters.ContainsKey('EndDate')) { $incompatibleParams += " - EndDate (not applicable for user-only export)" } + + # Activity configuration parameters + if ($PSBoundParameters.ContainsKey('ActivityTypes')) { $incompatibleParams += " - ActivityTypes (cleared by -OnlyUserInfo)" } + if ($IncludeM365Usage) { $incompatibleParams += " - IncludeM365Usage (activity type modifier)" } + if ($IncludeDSPMForAI) { $incompatibleParams += " - IncludeDSPMForAI (activity type modifier)" } + if ($ExcludeCopilotInteraction) { $incompatibleParams += " - ExcludeCopilotInteraction (activity type modifier)" } + + # Audit retrieval settings + if ($PSBoundParameters.ContainsKey('BlockHours') -and $BlockHours -ne 0.5) { $incompatibleParams += " - BlockHours (audit query partitioning)" } + if ($PSBoundParameters.ContainsKey('PartitionHours') -and $PartitionHours -ne 0) { $incompatibleParams += " - PartitionHours (audit query partitioning)" } + if ($PSBoundParameters.ContainsKey('MaxPartitions') -and $MaxPartitions -ne 160) { $incompatibleParams += " - MaxPartitions (audit query limits)" } + if ($PSBoundParameters.ContainsKey('ResultSize') -and $ResultSize -ne 10000) { $incompatibleParams += " - ResultSize (audit query page size)" } + if ($PSBoundParameters.ContainsKey('PacingMs') -and $PacingMs -ne 0) { $incompatibleParams += " - PacingMs 
(audit query throttling)" } + if ($AutoCompleteness) { $incompatibleParams += " - AutoCompleteness (audit log completeness checks)" } + if ($PSBoundParameters.ContainsKey('StreamingSchemaSample') -and $StreamingSchemaSample -ne 5000) { $incompatibleParams += " - StreamingSchemaSample (audit record schema sampling)" } + if ($PSBoundParameters.ContainsKey('StreamingChunkSize') -and $StreamingChunkSize -ne 5000) { $incompatibleParams += " - StreamingChunkSize (audit streaming batch size)" } + if ($PSBoundParameters.ContainsKey('ExportProgressInterval') -and $ExportProgressInterval -ne 10) { $incompatibleParams += " - ExportProgressInterval (audit export progress)" } + + # Filtering parameters + if ($PSBoundParameters.ContainsKey('AgentId')) { $incompatibleParams += " - AgentId (audit record filtering)" } + if ($AgentsOnly) { $incompatibleParams += " - AgentsOnly (audit record filtering)" } + if ($ExcludeAgents) { $incompatibleParams += " - ExcludeAgents (audit record filtering)" } + if ($PSBoundParameters.ContainsKey('PromptFilter')) { $incompatibleParams += " - PromptFilter (audit record content filtering)" } + if ($PSBoundParameters.ContainsKey('UserIds')) { $incompatibleParams += " - UserIds (audit record filtering; Entra fetch retrieves all users)" } + if ($PSBoundParameters.ContainsKey('GroupNames')) { $incompatibleParams += " - GroupNames (audit record filtering)" } + if ($PSBoundParameters.ContainsKey('RecordTypes')) { $incompatibleParams += " - RecordTypes (audit record filtering)" } + if ($PSBoundParameters.ContainsKey('ServiceTypes')) { $incompatibleParams += " - ServiceTypes (audit record filtering)" } + + # Processing mode parameters + if ($ExplodeArrays) { $incompatibleParams += " - ExplodeArrays (audit record array expansion)" } + if ($ExplodeDeep) { $incompatibleParams += " - ExplodeDeep (audit record deep expansion)" } + if ($PSBoundParameters.ContainsKey('RAWInputCSV')) { $incompatibleParams += " - RAWInputCSV (offline audit replay mode)" } + + # 
Parallel processing parameters + if ($EnableParallel) { $incompatibleParams += " - EnableParallel (parallel audit query execution)" } + if ($PSBoundParameters.ContainsKey('MaxConcurrency') -and $MaxConcurrency -ne 10) { $incompatibleParams += " - MaxConcurrency (concurrent query/partition limit)" } + if ($PSBoundParameters.ContainsKey('MaxParallelGroups') -and $MaxParallelGroups -ne 8) { $incompatibleParams += " - MaxParallelGroups (parallel activity group limit)" } + if ($PSBoundParameters.ContainsKey('ParallelMode') -and $ParallelMode -ne 'Auto') { $incompatibleParams += " - ParallelMode (parallel processing mode)" } + if ($DisableAdaptive) { $incompatibleParams += " - DisableAdaptive (adaptive concurrency controls)" } + if ($PSBoundParameters.ContainsKey('ProgressSmoothingAlpha') -and $ProgressSmoothingAlpha -ne 0.3) { $incompatibleParams += " - ProgressSmoothingAlpha (adaptive tuning)" } + if ($PSBoundParameters.ContainsKey('HighLatencyMs') -and $HighLatencyMs -ne 90000) { $incompatibleParams += " - HighLatencyMs (adaptive tuning)" } + if ($PSBoundParameters.ContainsKey('MemoryPressureMB') -and $MemoryPressureMB -ne 1500) { $incompatibleParams += " - MemoryPressureMB (adaptive tuning)" } + if ($PSBoundParameters.ContainsKey('LowLatencyMs') -and $LowLatencyMs -ne 20000) { $incompatibleParams += " - LowLatencyMs (adaptive tuning)" } + if ($PSBoundParameters.ContainsKey('LowLatencyConsecutive') -and $LowLatencyConsecutive -ne 2) { $incompatibleParams += " - LowLatencyConsecutive (adaptive tuning)" } + if ($PSBoundParameters.ContainsKey('ThroughputDropPct') -and $ThroughputDropPct -ne 15) { $incompatibleParams += " - ThroughputDropPct (adaptive tuning)" } + if ($PSBoundParameters.ContainsKey('ThroughputSmoothingAlpha') -and $ThroughputSmoothingAlpha -ne 0.3) { $incompatibleParams += " - ThroughputSmoothingAlpha (adaptive tuning)" } + if ($PSBoundParameters.ContainsKey('AdaptiveConcurrencyCeiling') -and $AdaptiveConcurrencyCeiling -ne 6) { $incompatibleParams += " - 
AdaptiveConcurrencyCeiling (adaptive tuning)" } + + # Reliability parameters (audit-specific) + if ($PSBoundParameters.ContainsKey('CircuitBreakerThreshold') -and $CircuitBreakerThreshold -ne 5) { $incompatibleParams += " - CircuitBreakerThreshold (block failure circuit breaker)" } + if ($PSBoundParameters.ContainsKey('CircuitBreakerCooldownSeconds') -and $CircuitBreakerCooldownSeconds -ne 120) { $incompatibleParams += " - CircuitBreakerCooldownSeconds (circuit breaker cooldown)" } + if ($PSBoundParameters.ContainsKey('BackoffBaseSeconds') -and $BackoffBaseSeconds -ne 1.0) { $incompatibleParams += " - BackoffBaseSeconds (block retry backoff)" } + if ($PSBoundParameters.ContainsKey('BackoffMaxSeconds') -and $BackoffMaxSeconds -ne 45) { $incompatibleParams += " - BackoffMaxSeconds (block retry max backoff)" } + + # Alternative modes + if ($UseEOM) { $incompatibleParams += " - UseEOM (Exchange Online Management mode incompatible with Graph Entra enrichment)" } + + # Output combination parameters + if ($CombineOutput) { $incompatibleParams += " - CombineOutput (only relevant with multiple activity types)" } + if ($AppendFile) { $incompatibleParams += " - AppendFile (appending user-only data to existing audit output not supported)" } + + if ($incompatibleParams.Count -gt 0) { + Write-Host "" + Write-Host "ERROR: The -OnlyUserInfo switch cannot be used with the following parameters:" -ForegroundColor Red + Write-Host "" + $incompatibleParams | ForEach-Object { Write-Host $_ -ForegroundColor Yellow } + Write-Host "" + Write-Host "The -OnlyUserInfo switch exports only Entra user directory and license information (no audit logs)." 
-ForegroundColor Cyan + Write-Host "" + Write-Host "Compatible parameters:" -ForegroundColor Green + Write-Host " - OutputPath (where to save the file)" -ForegroundColor White + Write-Host " - Auth (authentication method: WebLogin, DeviceCode, Credential, Silent)" -ForegroundColor White + Write-Host " - ExportWorkbook (export to Excel format)" -ForegroundColor White + Write-Host " - Force (bypass interactive prompts)" -ForegroundColor White + Write-Host " - MaxNetworkOutageMinutes (network resilience for Graph API calls)" -ForegroundColor White + Write-Host " - EmitMetricsJson (track Entra retrieval metrics)" -ForegroundColor White + Write-Host " - MetricsPath (custom metrics output location)" -ForegroundColor White + Write-Host " - SkipDiagnostics (skip pre-query capability checks)" -ForegroundColor White + Write-Host "" + Write-Host "Please remove the incompatible parameters and try again." -ForegroundColor Cyan + Write-Host "" + exit 1 + } + + # If validation passes, configure for user-only export + Write-Host "" + Write-Host "INFO: -OnlyUserInfo mode enabled. Skipping all audit log retrieval, exporting only Entra user data." 
-ForegroundColor Green
    Write-Host ""
    $IncludeUserInfo = $true
    $ActivityTypes = @()
}

# Canonical maps for Graph filter normalization.
# Keys are the lower-cased spelling a user may type; values are the casing used in Graph filters.
$recordTypeCanonicalMap = @{
    'azureactivedirectory'             = 'AzureActiveDirectory'
    'azureactivedirectoryaccountlogon' = 'AzureActiveDirectoryAccountLogon'
    'azureactivedirectorystslogon'     = 'AzureActiveDirectoryStsLogon'
    'exchangeadmin'                    = 'ExchangeAdmin'
    'exchangeitem'                     = 'ExchangeItem'
    'exchangemailbox'                  = 'ExchangeMailbox'
    'sharepointfileoperation'          = 'SharePointFileOperation'
    'sharepointsharingoperation'       = 'SharePointSharingOperation'
    'sharepoint'                       = 'SharePoint'
    'onedrive'                         = 'OneDrive'
    'microsoftteams'                   = 'MicrosoftTeams'
}

$serviceCanonicalMap = @{
    'azureactivedirectory' = 'AzureActiveDirectory'
    'exchange'             = 'Exchange'
    'sharepoint'           = 'SharePoint'
    'onedrive'             = 'OneDrive'
    'teams'                = 'Teams'
}

# Normalize optional Graph filter passthrough parameters (dedupe & trim).
# Each parameter may arrive as an array and/or as comma-separated strings; every value is
# split on ',', stripped of surrounding spaces/quotes/tabs, and empty tokens are dropped.
# A parameter that ends up with no usable tokens is reset to $null.

# --- ActivityTypes: split + dedupe only (no canonical casing applied) ---
if ($ActivityTypes) {
    $processedActivityTypes = New-Object System.Collections.Generic.List[string]
    $ActivityTypes |
        Where-Object { $null -ne $_ } |
        ForEach-Object { $_.ToString() -split ',' } |
        ForEach-Object { $_.Trim(" '""`t") } |
        Where-Object { -not [string]::IsNullOrWhiteSpace($_) } |
        ForEach-Object { $processedActivityTypes.Add($_) }

    $ActivityTypes = @(
        $processedActivityTypes |
            ForEach-Object { $_.Trim() } |
            Where-Object { $_ } |
            Select-Object -Unique
    )
    if ($ActivityTypes.Count -eq 0) {
        $ActivityTypes = $null
    }
}

# --- RecordTypes: split + dedupe, then map known names to canonical casing ---
if ($RecordTypes) {
    $processedRecordTypes = New-Object System.Collections.Generic.List[string]
    $RecordTypes |
        Where-Object { $null -ne $_ } |
        ForEach-Object { $_.ToString() -split ',' } |
        ForEach-Object { $_.Trim(" '""`t") } |
        Where-Object { -not [string]::IsNullOrWhiteSpace($_) } |
        ForEach-Object { $processedRecordTypes.Add($_) }

    $RecordTypes = @(
        $processedRecordTypes |
            ForEach-Object { $_.Trim() } |
            Where-Object { $_ } |
            Select-Object -Unique
    )
    if ($RecordTypes.Count -eq 0) {
        $RecordTypes = $null
    }
    else {
        # Unknown record types pass through untouched; known ones get canonical casing.
        $RecordTypes = @(
            $RecordTypes | ForEach-Object {
                $lookupKey = $_.ToLowerInvariant()
                if ($recordTypeCanonicalMap.ContainsKey($lookupKey)) { $recordTypeCanonicalMap[$lookupKey] } else { $_ }
            }
        ) | Select-Object -Unique
    }
}

# --- ServiceTypes: split + dedupe, then map known names to canonical casing ---
if ($ServiceTypes) {
    $processedServiceTypes = New-Object System.Collections.Generic.List[string]
    $ServiceTypes |
        Where-Object { $null -ne $_ } |
        ForEach-Object { $_.ToString() -split ',' } |
        ForEach-Object { $_.Trim(" '""`t") } |
        Where-Object { -not [string]::IsNullOrWhiteSpace($_) } |
        ForEach-Object { $processedServiceTypes.Add($_) }

    $ServiceTypes = @(
        $processedServiceTypes |
            ForEach-Object { $_.Trim() } |
            Where-Object { $_ } |
            Select-Object -Unique
    )
    if ($ServiceTypes.Count -eq 0) {
        $ServiceTypes = $null
    }
    else {
        # Unknown services pass through untouched; known ones get canonical casing.
        $ServiceTypes = @(
            $ServiceTypes | ForEach-Object {
                $lookupKey = $_.ToLowerInvariant()
                if ($serviceCanonicalMap.ContainsKey($lookupKey)) { $serviceCanonicalMap[$lookupKey] } else { $_ }
            }
        ) | Select-Object -Unique
    }
}

# Mapping of audit record types to supported workloads for Graph security audit queries
$recordTypeWorkloadMap = @{
    'azureActiveDirectory' = @('AzureActiveDirectory')
    'azureActiveDirectoryAccountLogon' = @('AzureActiveDirectory')
    'azureActiveDirectoryStsLogon' = @('AzureActiveDirectory')
    'exchangeAdmin' = @('Exchange')
    'exchangeItem' = @('Exchange')
    'exchangeMailbox' = @('Exchange')
    'sharePointFileOperation' = @('SharePoint','OneDrive')
    'sharePointSharingOperation' = @('SharePoint','OneDrive')
    'sharePoint' = @('SharePoint','OneDrive')
    'onedrive' = @('OneDrive')
    'microsoftTeams' = @('Teams')
    # M365 usage record types: Process in first workload pass to avoid creating additional passes
    # These record types capture cross-workload activities (Office apps, Forms,
Stream, Planner, PowerApps) + # Mapping to Exchange ensures they run in the first service-filtered pass + 'officeNative' = @('Exchange') + 'microsoftForms' = @('Exchange') + 'microsoftStream' = @('Exchange') + 'plannerPlan' = @('Exchange') + 'plannerTask' = @('Exchange') + 'powerAppsApp' = @('Exchange') +} + +$serviceOperationMap = @{ + 'AzureActiveDirectory' = @('UserLoggedIn','UserLoginFailed','AdminLoggedIn','ResetUserPassword','AddRegisteredUser','UpdateUser','ChangedUserSetting') + 'Exchange' = @('MailItemsAccessed','Send','SendOnBehalf','SoftDelete','HardDelete','MoveToDeletedItems','CopyToFolder','AddMailboxPermission','RemoveMailboxPermission') + 'SharePoint' = @('FileAccessed','FileDownloaded','FileUploaded','FileModified','FileDeleted','FileMoved','SharingInvitationCreated','SharingInvitationAccepted','SharedLinkCreated','SharingRevoked','AddMemberToUnifiedGroup','RemoveMemberFromUnifiedGroup') + 'OneDrive' = @('FileAccessed','FileDownloaded','FileUploaded','FileModified','FileDeleted','FileMoved','SharingInvitationCreated','SharingInvitationAccepted','SharedLinkCreated','SharingRevoked','AddMemberToUnifiedGroup','RemoveMemberFromUnifiedGroup') + 'Teams' = @('TeamMemberAdded','TeamMemberRemoved','ChannelAdded','ChannelDeleted','ChannelMessageSent','ChannelMessageDeleted','TeamDeleted','TeamArchived','AddMemberToUnifiedGroup','RemoveMemberFromUnifiedGroup') + 'MicrosoftForms' = @('CreateForm','EditForm','DeleteForm','ViewForm','CreateResponse','SubmitResponse','ViewResponse','DeleteResponse') + 'MicrosoftStream' = @('StreamModified','StreamViewed','StreamDeleted','StreamDownloaded') + 'MicrosoftPlanner' = @('PlanCreated','PlanDeleted','PlanModified','TaskCreated','TaskDeleted','TaskModified','TaskAssigned','TaskCompleted') + 'PowerApps' = @('LaunchedApp','CreatedApp','EditedApp','DeletedApp','PublishedApp') +} + +$copilotBaseActivityType = 'CopilotInteraction' +$m365UsageServiceBundle = @('Exchange','SharePoint','OneDrive','Teams') +$m365UsageRecordBundle 
= @('ExchangeAdmin','ExchangeItem','ExchangeMailbox','SharePointFileOperation','SharePointSharingOperation','SharePoint','OneDrive','MicrosoftTeams','OfficeNative','MicrosoftForms','MicrosoftStream','PlannerPlan','PlannerTask','PowerAppsApp') +# Curated M365 usage operations spanning Exchange/SharePoint/OneDrive/Teams/Forms/Stream/Planner/PowerApps and Office desktop apps (Word/Excel/PowerPoint/OneNote) +$m365UsageActivityBundle = @( + # === Exchange/Email === + 'MailItemsAccessed','Send','SendOnBehalf','SoftDelete','HardDelete','MoveToDeletedItems','CopyToFolder', + + # === SharePoint/OneDrive - Files === + 'FileAccessed','FileDownloaded','FileUploaded','FileModified','FileDeleted','FileMoved', + 'FileCheckedIn','FileCheckedOut','FileRecycled','FileRestored','FileVersionsAllDeleted', + + # === SharePoint/OneDrive - Sharing === + 'SharingInvitationCreated','SharingInvitationAccepted','SharedLinkCreated','SharingRevoked', + 'RemovedFromSecureLink', + + # === Groups/Unified Groups === + 'AddMemberToUnifiedGroup','RemoveMemberFromUnifiedGroup', + + # === Teams - Team/Channel management === + 'TeamCreated','TeamDeleted','TeamArchived','TeamSettingChanged', + 'TeamMemberAdded','TeamMemberRemoved','MemberAdded','MemberRemoved','MemberRoleChanged', + 'ChannelAdded','ChannelDeleted','ChannelSettingChanged','ChannelOwnerResponded', + 'ChannelMessageSent','ChannelMessageDeleted', + 'BotAddedToTeam','BotRemovedFromTeam', + 'TabAdded','TabRemoved','TabUpdated', + 'ConnectorAdded','ConnectorRemoved','ConnectorUpdated', + + # === Teams - Chat/Messaging (1:1 and group chats) === + 'TeamsSessionStarted', + 'ChatCreated','ChatRetrieved','ChatUpdated', + 'MessageSent','MessageRead','MessageDeleted','MessageUpdated','MessagesListed', + 'MessageCreation','MessageCreatedHasLink','MessageEditedHasLink', + 'MessageHostedContentRead','MessageHostedContentsListed', + 'SensitiveContentShared', + + # === Teams - Meeting lifecycle === + 'MeetingCreated','MeetingUpdated','MeetingDeleted', + 
'MeetingStarted','MeetingEnded', + 'MeetingParticipantJoined','MeetingParticipantLeft','MeetingParticipantRoleChanged', + 'MeetingRecordingStarted','MeetingRecordingEnded', + 'MeetingDetail','MeetingParticipantDetail', + 'LiveNotesUpdate','AINotesUpdate', + 'RecordingExported','TranscriptsExported', + + # === Teams - Apps/Approvals === + 'AppInstalled','AppUpgraded','AppUninstalled', + 'CreatedApproval','ApprovedRequest','RejectedApprovalRequest','CanceledApprovalRequest', + + # === Office apps (Word, Excel, PowerPoint, etc.) === + 'Create','Edit','Open','Save','Print', + + # === Microsoft Forms === + 'CreateForm','EditForm','DeleteForm','ViewForm','CreateResponse','SubmitResponse','ViewResponse','DeleteResponse', + + # === Microsoft Stream === + 'StreamModified','StreamViewed','StreamDeleted','StreamDownloaded', + + # === Planner === + 'PlanCreated','PlanDeleted','PlanModified','TaskCreated','TaskDeleted','TaskModified','TaskAssigned','TaskCompleted', + + # === Power Apps === + 'LaunchedApp','CreatedApp','EditedApp','DeletedApp','PublishedApp', + + # === Copilot === + 'CopilotInteraction' +) | Select-Object -Unique + +# Script version constant (must appear after param/help to keep param() valid as first executable block) +$ScriptVersion = '1.10.8' + +# --- Initialize/Clear persistent script variables to prevent cross-run contamination --- +# Note: Script-scoped variables persist across multiple script invocations in the same PowerShell session +$script:partitionStatus = $null +$script:processedJobIds = $null +$script:shownJobMessages = $null + +# --- Known Microsoft 365 Copilot SKU IDs --- +# Source: PAX Graph Audit Log Processor + Microsoft official SKU documentation +$script:CopilotSkuIds = @{ + 'c815c93d-0759-4bb8-b857-bc921a71be83' = 'Microsoft 365 Copilot' # M365 Copilot + '06ebc4ee-1bb5-47dd-8120-11324bc54e06' = 'Microsoft 365 Copilot' # M365 Copilot (alternative) + 'a1c5e422-7c00-4433-a276-0f5b5f02e952' = 'Copilot Pro' # Copilot Pro + 
'4a51bca5-1eff-43f5-878c-177680f191af' = 'Microsoft Copilot for Microsoft 365' # Another variant + 'f841e8a7-8d86-4eae-af8c-d14b2a4c7228' = 'Microsoft 365 Copilot' # Additional variant + 'd814ea5e-2d90-455a-8b9e-2e5e4f3e8e8d' = 'Microsoft Copilot for M365' # Additional variant + '440eaaa8-b3e0-484b-a8be-62870b9ba70a' = 'Microsoft 365 Copilot' # Detected from tenant usage + # Additional SKUs from Microsoft official documentation (https://learn.microsoft.com/licensing-service-plan-reference) + 'ad9c22b3-52d7-4e7e-973c-88121ea96436' = 'Microsoft 365 Copilot (Education Faculty)' # EDU Faculty + '15f2e9fc-b782-4f73-bf51-81d8b7fff6f4' = 'Microsoft Copilot for Sales' # Sales Copilot + '639dec6b-bb19-468b-871c-c5c441c4b0cb' = 'Copilot for Microsoft 365' # Official product name variant +} + +# --- DSPM for AI: Synchronized Timestamp & OutputPath Validation --- + +# Generate synchronized timestamp for all output files in this run +$global:ScriptRunTimestamp = Get-Date -Format 'yyyyMMdd_HHmmss' + +# --- Logging Helper Functions (defined early for use throughout script) --- +# Log file path will be set after OutputFile resolution; buffer early entries. 
$script:LogFile = $null
$script:LogBuffer = New-Object System.Collections.Generic.List[string]

# Writes $Message to the console and records "[timestamp] [Level] message" in the log.
# While $script:LogFile is unset, entries are held in $script:LogBuffer instead.
# Logging failures are swallowed on purpose so that logging never aborts the run.
function Write-Log {
    param(
        [Parameter(Mandatory = $true)]
        [string]$Message,

        [string]$Level = "INFO"
    )

    $stamp = Get-Date -Format "yyyy-MM-dd HH:mm:ss"
    $line = "[$stamp] [$Level] $Message"

    # Call the built-in cmdlet directly so the global Write-Host wrapper below
    # does not log the same message a second time.
    Microsoft.PowerShell.Utility\Write-Host $Message

    try {
        if (-not $script:LogFile) {
            $script:LogBuffer.Add($line) | Out-Null
        }
        else {
            Add-Content -Path $script:LogFile -Value $line -Encoding UTF8 -ErrorAction SilentlyContinue
        }
    }
    catch {}
}

# Same as Write-Log but with a console colour; the log entry level is always INFO.
# An empty message is allowed (used for blank spacer lines).
function Write-LogHost {
    param(
        [Parameter(Mandatory = $true)]
        [AllowEmptyString()]
        [string]$Message,

        [string]$ForegroundColor = "White"
    )

    Microsoft.PowerShell.Utility\Write-Host $Message -ForegroundColor $ForegroundColor

    try {
        $stamp = Get-Date -Format "yyyy-MM-dd HH:mm:ss"
        $line = "[$stamp] [INFO] $Message"
        if (-not $script:LogFile) {
            $script:LogBuffer.Add($line) | Out-Null
        }
        else {
            Add-Content -Path $script:LogFile -Value $line -Encoding UTF8 -ErrorAction SilentlyContinue
        }
    }
    catch {}
}

# Mirror Write-Host to log file with matching signature.
# Every call is forwarded unchanged to the built-in cmdlet, then the rendered text
# (if non-empty) is appended to the log file or to the early-entry buffer.
function global:Write-Host {
    [CmdletBinding()]
    param(
        [Parameter(Position=0, ValueFromPipeline=$true, ValueFromRemainingArguments=$true)]
        $Object,
        [object] $Separator,
        [ConsoleColor] $ForegroundColor,
        [ConsoleColor] $BackgroundColor,
        [switch] $NoNewLine
    )
    process {
        Microsoft.PowerShell.Utility\Write-Host @PSBoundParameters
        try {
            # Render the object(s) the same way the console would show them
            $rendered = (@($Object) | Out-String).TrimEnd()
            if ($rendered) {
                $stamp = Get-Date -Format "yyyy-MM-dd HH:mm:ss"
                $line = "[$stamp] [INFO] $rendered"
                if (-not $script:LogFile) {
                    $script:LogBuffer.Add($line) | Out-Null
                }
                else {
                    Add-Content -Path $script:LogFile -Value $line -Encoding UTF8 -ErrorAction SilentlyContinue
                }
            }
        }
        catch {}
    }
}

function Get-MaskedUsername {
    <#
    .SYNOPSIS
    Masks the local part of an email-style username for display in logs and screenshots.

    .DESCRIPTION
    Turns "admin@contoso.com" into "a******n@contoso.com": the first and last character
    of the local part are kept and everything between is replaced by six asterisks.
    A local part of two characters or fewer keeps only its first character.

    The input is returned unchanged when it is null/empty/whitespace, contains no "@",
    or does not split into exactly two parts on "@".

    .PARAMETER Username
    The username or email address to mask

    .OUTPUTS
    Masked string (e.g., "a******n@contoso.com")

    .EXAMPLE
    Get-MaskedUsername -Username "admin@contoso.com"
    Returns: "a******n@contoso.com"
    #>

    param(
        [Parameter(Mandatory = $false)]
        [string]$Username
    )

    # Nothing to mask: blank input or not an email-style value
    if ([string]::IsNullOrWhiteSpace($Username) -or $Username -notmatch '@') {
        return $Username
    }

    $segments = $Username -split '@'
    if ($segments.Count -ne 2) {
        return $Username
    }

    $local = $segments[0]
    $domain = $segments[1]

    # Very short local parts expose only their first character
    if ($local.Length -le 2) {
        return "$($local[0])******@$domain"
    }

    return "$($local[0])******$($local[$local.Length - 1])@$domain"
}

# --- Helper Function: Detect if PAYG billing is configured in tenant ---
function Test-PAYGBillingEnabled {
    <#
    .SYNOPSIS
    Placeholder for detecting whether Microsoft Purview PAYG billing is configured in the tenant.

    .DESCRIPTION
    Intended indicators that PAYG billing is enabled:
    - Attempts to query audit records for AIAppInteraction type (PAYG-only)
    - If records exist or query succeeds without billing errors, PAYG is likely enabled
    - Returns $null if detection is inconclusive

    The current implementation performs no query: it logs that detection is deferred
    and always returns $null.

    .OUTPUTS
    $true if PAYG billing appears to be enabled
    $false if PAYG billing appears to be disabled or not configured
    $null if detection is inconclusive (requires actual query attempt)
    #>

    # Note: The most reliable way to detect PAYG is to attempt a query for AIAppInteraction
    # and check for specific error responses. However, this requires actual data/timeframe.
    # For now, we return $null to indicate "unknown" and let the post-query detection handle it.

    Write-LogHost "PAYG billing detection: Deferred to post-query validation" -ForegroundColor DarkGray
    return $null
}

# --- Helper Function: Determine if DSPM features are being used beyond default M365 Copilot types ---
function Test-DSPMFeaturesEnabled {
    <#
    .SYNOPSIS
    Determines if DSPM for AI features are being used beyond the default M365 Copilot activity types.

    .DESCRIPTION
    Returns $true if -IncludeDSPMForAI switch is enabled.
    This adds: ConnectedAIAppInteraction, AIInteraction, AIAppInteraction

    Returns $false if only default M365 Copilot type (CopilotInteraction) is being queried.
#>
    return $IncludeDSPMForAI
}

# --- Excel Export Validation ---

# -AppendFile needs no format-specific handling at this point: without -ExportWorkbook it
# means "append to CSV". The filename/extension checks further down enforce that the append
# target matches the selected export mode.

# Log export mode
if ($ExportWorkbook) {
    # Determine Excel output mode based on -CombineOutput parameter
    if ($CombineOutput) {
        Write-Host "Excel export mode: Combined activity tab + separate EntraUsers tab (if requested)" -ForegroundColor Cyan
    } else {
        # Default for Excel: separated tabs
        Write-Host "Excel export mode: Multi-tab workbook (one tab per activity type)" -ForegroundColor Cyan
    }

    if ($AppendFile) {
        Write-Host "Append mode: Enabled (will validate existing workbook structure)" -ForegroundColor Cyan
    }
    Write-Host ""
} else {
    # CSV export mode
    if ($OnlyUserInfo) {
        # OnlyUserInfo mode: No activity files, just Entra user data
        Write-Host "CSV export mode: Entra user directory and licensing data only (no audit logs)" -ForegroundColor Cyan
    } else {
        # Determine CSV output mode based on -CombineOutput parameter.
        # Replay from a RAW input CSV always implies combined output, so force the switch on.
        if ($RAWInputCSV -and -not $CombineOutput.IsPresent) { $CombineOutput = [System.Management.Automation.SwitchParameter]::new($true) }
        if ($CombineOutput.IsPresent -or $RAWInputCSV) {
            # User specified -CombineOutput switch: combine all activity types
            $csvModeMsg = "Combined activity file"
            if ($IncludeUserInfo) { $csvModeMsg += " + separate EntraUsers file" }
            Write-Host "CSV export mode: $csvModeMsg" -ForegroundColor Cyan
        } else {
            # Default for live CSV: separate files per activity type
            $csvModeMsg = "Separate activity files (one per activity type)"
            if ($IncludeUserInfo) { $csvModeMsg += " + EntraUsers file" }
            Write-Host "CSV export mode: $csvModeMsg" -ForegroundColor Cyan
        }
    }
    Write-Host ""
}

# Validate OutputPath is folder only (no filenames)
if ($OutputPath) {
    # A path that already exists as a directory is always a valid folder, even when its
    # name contains a dot (e.g. 'C:\Users\john.smith' or 'D:\Exports\2024.01'). The
    # "looks like a filename" heuristic is therefore applied only to other paths.
    $outputPathIsExistingFolder = [bool](Test-Path -LiteralPath $OutputPath -PathType Container -ErrorAction SilentlyContinue)

    # Check if path contains file extension or appears to be a filename
    if (-not $outputPathIsExistingFolder -and
        ($OutputPath -match '\.[a-zA-Z0-9]{2,4}$' -or ($OutputPath -notmatch '[\\/]$' -and (Split-Path -Leaf $OutputPath) -match '\.'))) {
        Write-Host "ERROR: OutputPath must be a folder path only. Custom filenames are not supported." -ForegroundColor Red
        Write-Host "The script will automatically generate timestamped filenames based on activity types." -ForegroundColor Yellow
        Write-Host "" -ForegroundColor Yellow
        Write-Host "Example valid paths:" -ForegroundColor Green
        Write-Host " -OutputPath 'C:\Temp\'" -ForegroundColor Green
        Write-Host " -OutputPath 'D:\AuditLogs\'" -ForegroundColor Green
        Write-Host "" -ForegroundColor Yellow
        Write-Host "Example INVALID path:" -ForegroundColor Red
        Write-Host " -OutputPath 'C:\Temp\myfile.csv'" -ForegroundColor Red
        exit 1
    }

    # Ensure OutputPath ends with backslash
    if (-not $OutputPath.EndsWith('\')) {
        $OutputPath = $OutputPath + '\'
    }

    # Create directory if it doesn't exist
    if (-not (Test-Path -Path $OutputPath -PathType Container)) {
        try {
            New-Item -Path $OutputPath -ItemType Directory -Force | Out-Null
            Write-Host "INFO: Created output directory: $OutputPath" -ForegroundColor Green
        }
        catch {
            Write-Host "ERROR: Failed to create output directory: $OutputPath" -ForegroundColor Red
            Write-Host "Error: $_" -ForegroundColor Red
            exit 1
        }
    }
}

# Validate AppendFile is not used with EntraUsers export modes
if ($AppendFile -and ($IncludeUserInfo -or $OnlyUserInfo)) {
    Write-Host "ERROR: -AppendFile cannot be used with EntraUsers export modes" -ForegroundColor Red
    Write-Host "" -ForegroundColor Yellow
    Write-Host "EntraUsers data is always overwritten (never appended) because it represents" -ForegroundColor Yellow
    Write-Host "a point-in-time snapshot of your tenant's user information, not time-based" -ForegroundColor Yellow
    Write-Host "activity data. Each export should create a fresh EntraUsers dataset." -ForegroundColor Yellow
    Write-Host "" -ForegroundColor Yellow
    Write-Host "Solutions:" -ForegroundColor Green
    Write-Host " 1. Remove -IncludeUserInfo or -OnlyUserInfo to append activity data only" -ForegroundColor Green
    Write-Host " 2. Run without -AppendFile to create new timestamped files" -ForegroundColor Green
    exit 1
}

# Validate AppendFile has proper filename format
if ($AppendFile) {
    # Reject directory paths (anything ending with a slash or backslash)
    if ($AppendFile -match '[\\/]$') {
        Write-Host "ERROR: -AppendFile must specify a filename, not a directory path" -ForegroundColor Red
        Write-Host "" -ForegroundColor Yellow
        Write-Host "Valid examples:" -ForegroundColor Green
        Write-Host " -AppendFile 'MyReport.xlsx'" -ForegroundColor Green
        Write-Host " -AppendFile 'C:\Data\Audit\Report.csv'" -ForegroundColor Green
        Write-Host "" -ForegroundColor Yellow
        Write-Host "Invalid examples:" -ForegroundColor Red
        Write-Host " -AppendFile 'C:\Data\'" -ForegroundColor Red
        Write-Host " -AppendFile 'C:\Data\Audit\'" -ForegroundColor Red
        exit 1
    }

    # Extract file extension (lower-cased so the comparisons below ignore case)
    $appendExt = [System.IO.Path]::GetExtension($AppendFile).ToLower()

    # Validate extension exists
    if (-not $appendExt) {
        Write-Host "ERROR: -AppendFile must include a file extension (.csv or .xlsx)" -ForegroundColor Red
        Write-Host "" -ForegroundColor Yellow
        Write-Host "Valid examples:" -ForegroundColor Green
        Write-Host " -AppendFile 'MyReport.xlsx'" -ForegroundColor Green
        Write-Host " -AppendFile 'AuditData.csv'" -ForegroundColor Green
        exit 1
    }

    # Validate extension matches export mode: .xlsx with -ExportWorkbook, .csv otherwise
    if ($ExportWorkbook -and $appendExt -ne '.xlsx') {
        Write-Host "ERROR: -AppendFile must use .xlsx extension when -ExportWorkbook is specified" -ForegroundColor Red
        Write-Host "You specified: $AppendFile" -ForegroundColor Yellow
        Write-Host "" -ForegroundColor Yellow
        Write-Host "Solutions:" -ForegroundColor Green
        Write-Host " 1. Change filename to use .xlsx extension" -ForegroundColor Green
        Write-Host " 2. Remove -ExportWorkbook to append CSV data instead" -ForegroundColor Green
        exit 1
    }
    elseif (-not $ExportWorkbook -and $appendExt -ne '.csv') {
        Write-Host "ERROR: -AppendFile must use .csv extension for CSV mode" -ForegroundColor Red
        Write-Host "You specified: $AppendFile" -ForegroundColor Yellow
        Write-Host "" -ForegroundColor Yellow
        Write-Host "Solutions:" -ForegroundColor Green
        Write-Host " 1. Change filename to use .csv extension" -ForegroundColor Green
        Write-Host " 2. Add -ExportWorkbook to append Excel data instead" -ForegroundColor Green
        exit 1
    }
}

# --- DSPM for AI: Conflict Detection for -ExcludeCopilotInteraction ---

$script:ConflictResolved = $false
$script:ConflictChoice = $null

# Detect conflicts where user wants to both include AND exclude CopilotInteraction:
# 1. Explicit include via -ActivityTypes parameter
# 2. Explicit include via -IncludeCopilotInteraction switch
# 3.
Implicit include via DSPM switches (which require CopilotInteraction for context) +$explicitInclude = $ActivityTypes -and ($ActivityTypes -contains 'CopilotInteraction') +$explicitIncludeViaSwitch = $IncludeCopilotInteraction +$implicitIncludeViaDSPM = $IncludeDSPMForAI + +if ($ExcludeCopilotInteraction -and ($explicitInclude -or $explicitIncludeViaSwitch -or $implicitIncludeViaDSPM)) { + if (-not $Force) { + Write-Host "" + Write-Host "============================================================================================================" -ForegroundColor Yellow + Write-Host "CONFLICT DETECTED" -ForegroundColor Red + Write-Host "============================================================================================================" -ForegroundColor Yellow + Write-Host "" + + if ($explicitInclude) { + Write-Host "You provided 'CopilotInteraction' in -ActivityTypes but also specified -ExcludeCopilotInteraction switch." -ForegroundColor Yellow + } + elseif ($explicitIncludeViaSwitch) { + Write-Host "You enabled -IncludeCopilotInteraction but also specified -ExcludeCopilotInteraction." -ForegroundColor Yellow + } + elseif ($implicitIncludeViaDSPM) { + Write-Host "You specified DSPM switches (which require M365 Copilot data for context) but also specified -ExcludeCopilotInteraction." 
-ForegroundColor Yellow + Write-Host "DSPM switches enabled: -IncludeDSPMForAI" -ForegroundColor Cyan + } + + Write-Host "" + Write-Host "Microsoft 365 Copilot data (CopilotInteraction) includes:" -ForegroundColor Cyan + Write-Host " - M365 Copilot (Word, Excel, PowerPoint, Outlook, Teams meetings, etc.)" -ForegroundColor Cyan + Write-Host " - Microsoft 365 Copilot Chat (Office.com)" -ForegroundColor Cyan + Write-Host " - Security Copilot" -ForegroundColor Cyan + Write-Host " - Copilot Studio interactions" -ForegroundColor Cyan + Write-Host " - Billing: FREE (included with E5/Audit Standard)" -ForegroundColor Green + Write-Host "" + Write-Host "Do you want to INCLUDE or EXCLUDE Microsoft 365 Copilot activity type?" -ForegroundColor Yellow + Write-Host " [I] INCLUDE - Proceed with M365 Copilot data enabled (override -ExcludeCopilotInteraction switch)" -ForegroundColor Green + Write-Host " [E] EXCLUDE - Remove CopilotInteraction (honor -ExcludeCopilotInteraction switch)" -ForegroundColor Red + Write-Host "" + + Send-PromptNotification + $userChoice = Read-Host "Enter your choice (I/E)" + + if ($userChoice -eq 'I' -or $userChoice -eq 'i') { + $ExcludeCopilotInteraction = $false + $script:ConflictChoice = 'INCLUDE' + Write-Host "" + Write-Host "Choice: INCLUDE - Proceeding with CopilotInteraction enabled" -ForegroundColor Green + Write-Host "" + } + elseif ($userChoice -eq 'E' -or $userChoice -eq 'e') { + $script:ConflictChoice = 'EXCLUDE' + Write-Host "" + Write-Host "Choice: EXCLUDE - CopilotInteraction will be removed from ActivityTypes" -ForegroundColor Red + Write-Host "" + } + else { + Write-Host "" + Write-Host "ERROR: Invalid choice. Please enter 'I' for INCLUDE or 'E' for EXCLUDE." 
-ForegroundColor Red + exit 1 + } + + $script:ConflictResolved = $true + } + else { + # Force mode - honor ExcludeCopilotInteraction without prompt + $script:ConflictChoice = 'EXCLUDE (Force mode)' + $script:ConflictResolved = $true + } +} + +# ============================================== +# ImportExcel Module Check (for Excel export) +# ============================================== + +if ($ExportWorkbook) { + Write-Host "Checking ImportExcel module for Excel export..." -ForegroundColor Cyan + + $importExcelModule = Get-Module -ListAvailable -Name ImportExcel | Select-Object -First 1 + if (-not $importExcelModule) { + Write-Host "ImportExcel module not found (required for -ExportWorkbook)." -ForegroundColor Yellow + Write-Host "Installing ImportExcel module..." -ForegroundColor Yellow + Write-Host "" + + try { + Install-Module ImportExcel -Scope CurrentUser -Force -AllowClobber -Repository PSGallery -ErrorAction Stop + Write-Host "ImportExcel module installed successfully!" -ForegroundColor Green + Write-Host "" + + # Re-check for the module + $importExcelModule = Get-Module -ListAvailable -Name ImportExcel | Select-Object -First 1 + if (-not $importExcelModule) { + Write-Host "ERROR: Module installation completed but module not found. Try restarting PowerShell." -ForegroundColor Red + exit 1 + } + } + catch { + Write-Host "ERROR: Failed to install ImportExcel module: $($_.Exception.Message)" -ForegroundColor Red + Write-Host "" + Write-Host "Please install manually using:" -ForegroundColor Yellow + Write-Host " Install-Module ImportExcel -Scope CurrentUser" -ForegroundColor Yellow + Write-Host "" + Write-Host "Falling back to CSV export..." 
-ForegroundColor Yellow + $script:ExportWorkbook = $false + $script:AppendFile = $false + } + } + else { + Write-Host "ImportExcel module detected: $($importExcelModule.Name) v$($importExcelModule.Version)" -ForegroundColor Green + } + + # Import ImportExcel module + if ($ExportWorkbook) { + try { + Import-Module ImportExcel -ErrorAction Stop + Write-Host "ImportExcel module imported successfully" -ForegroundColor Green + Write-Host "" + } + catch { + Write-Host "ERROR: Failed to import ImportExcel module: $($_.Exception.Message)" -ForegroundColor Red + Write-Host "Falling back to CSV export..." -ForegroundColor Yellow + $script:ExportWorkbook = $false + $script:AppendFile = $false + } + } +} + +# --- Early parameter validation & environment sanity checks --- + +# PowerShell 5.1 requires -UseEOM mode (Graph API mode requires PS 7+ for ThreadJob parallelism) +if ($PSVersionTable.PSVersion.Major -lt 7 -and -not $UseEOM -and -not $RAWInputCSV -and -not $Resume) { + Write-Host "" -ForegroundColor Red + Write-Host "═══════════════════════════════════════════════════════════════════════════════" -ForegroundColor Red + Write-Host " ERROR: PowerShell 5.1 Detected - Graph API Mode Not Supported" -ForegroundColor Red + Write-Host "═══════════════════════════════════════════════════════════════════════════════" -ForegroundColor Red + Write-Host "" + Write-Host " The default Microsoft Graph API mode requires PowerShell 7+ for parallel query execution." -ForegroundColor Yellow + Write-Host " PowerShell 5.1 is supported, but requires -UseEOM (Exchange Online Management) mode." 
-ForegroundColor Yellow + Write-Host "" + Write-Host " SOLUTION: Add the -UseEOM switch to your command:" -ForegroundColor Cyan + Write-Host "" + Write-Host " .\PAX_Purview_Audit_Log_Processor.ps1 -UseEOM [your other parameters]" -ForegroundColor White + Write-Host "" + Write-Host " OR upgrade to PowerShell 7+ for Graph API mode (recommended for performance):" -ForegroundColor Cyan + Write-Host " https://aka.ms/powershell" -ForegroundColor White + Write-Host "" + Write-Host " EOM MODE NOTES:" -ForegroundColor DarkCyan + Write-Host " - Uses Search-UnifiedAuditLog cmdlet (serial processing)" -ForegroundColor Gray + Write-Host " - Requires Exchange Online Management module" -ForegroundColor Gray + Write-Host " - Requires Exchange Admin role or audit log read permissions" -ForegroundColor Gray + Write-Host " - Some features (Entra user enrichment) not available in EOM mode" -ForegroundColor Gray + Write-Host "" + Write-Host "═══════════════════════════════════════════════════════════════════════════════" -ForegroundColor Red + exit 1 +} + +if ($ExcludeAgents -and ($AgentId -or $AgentsOnly)) { + Write-Host "ERROR: -ExcludeAgents cannot be used with -AgentId or -AgentsOnly switches." -ForegroundColor Red + Write-Host "These switches are mutually exclusive:" -ForegroundColor Yellow + Write-Host " -AgentId/-AgentsOnly: Filter to ONLY records with agents" -ForegroundColor Yellow + Write-Host " -ExcludeAgents: Filter to ONLY records without agents" -ForegroundColor Yellow + Write-Host "Please use only one filtering approach and re-run." 
-ForegroundColor Yellow
    exit 1
}

# ============================================================================
# APPENDFILE COLUMN VALIDATION FUNCTIONS
# ============================================================================
# Early validation to prevent wasting time on Graph API queries when explosion
# parameters don't match between existing file and new data parameters.
# Note: For explosion modes, actual column schemas are dynamic and vary by data,
# so we validate explosion parameter compatibility rather than exact columns.
# ============================================================================

# Guesses which explosion mode produced a file from its column names.
# Checks run from most to least specific: deep explosion, array explosion, standard.
# Returns a hashtable with Mode ("ExplodeDeep" | "ExplodeArrays" | "Standard" | "Unknown")
# and a human-readable DisplayName.
function Get-LikelyExplosionParams {
    param([string[]]$Columns)

    # Check for deep explosion indicators (CopilotEventData.* columns)
    $hasDeepColumns = $Columns | Where-Object { $_ -match '^CopilotEventData\.' }
    if ($hasDeepColumns) {
        return @{ Mode = "ExplodeDeep"; DisplayName = "-ExplodeDeep" }
    }

    # Check for array explosion indicators (exploded field names like Message_, Context_, AgentId, etc.)
    $hasArrayColumns = $Columns | Where-Object { $_ -match '^(Message_|Context_|Interaction_|AgentId|AgentName|AgentVersion|AccessedResource_|AISystemPlugin_)' }
    if ($hasArrayColumns) {
        return @{ Mode = "ExplodeArrays"; DisplayName = "-ExplodeArrays" }
    }

    # Check for standard mode indicators (AuditData JSON column present)
    $hasAuditData = $Columns -contains 'AuditData'
    if ($hasAuditData) {
        return @{ Mode = "Standard"; DisplayName = "Standard (no explosion)" }
    }

    # Unable to determine
    return @{ Mode = "Unknown"; DisplayName = "Unknown mode" }
}

# Checks whether an existing CSV/XLSX file was produced with the same explosion mode as the
# current command, so that appending to it is safe.
#
# Parameters:
#   FilePath      - existing file to append to
#   IsExcel       - $true to read the header via ImportExcel, $false to read a CSV header line
#   ExplodeArrays - current run uses -ExplodeArrays
#   ExplodeDeep   - current run uses -ExplodeDeep (takes precedence over ExplodeArrays)
#   TargetSheet   - optional worksheet name; the first sheet is used when omitted
#
# Returns a hashtable: Compatible, ExistingMode, CurrentMode, ExistingColumns, ExistingCount,
# ErrorMessage. Any exception while reading the file is reported through ErrorMessage with
# Compatible = $false rather than being thrown to the caller.
function Test-AppendFileCompatibility {
    param(
        [string]$FilePath,
        [bool]$IsExcel,
        [bool]$ExplodeArrays,
        [bool]$ExplodeDeep,
        [string]$TargetSheet = $null
    )

    $result = @{
        Compatible = $true
        ExistingMode = $null
        CurrentMode = $null
        ExistingColumns = @()
        ExistingCount = 0
        ErrorMessage = $null
    }

    try {
        # Determine current explosion mode
        if ($ExplodeDeep) {
            $result.CurrentMode = @{ Mode = "ExplodeDeep"; DisplayName = "-ExplodeDeep" }
        }
        elseif ($ExplodeArrays) {
            $result.CurrentMode = @{ Mode = "ExplodeArrays"; DisplayName = "-ExplodeArrays" }
        }
        else {
            $result.CurrentMode = @{ Mode = "Standard"; DisplayName = "Standard (no explosion)" }
        }

        # Read existing file columns
        if ($IsExcel) {
            # Validate Excel file and read columns
            if (-not (Get-Module -Name ImportExcel -ListAvailable)) {
                $result.ErrorMessage = "ImportExcel module not available for validation"
                $result.Compatible = $false
                return $result
            }

            Import-Module ImportExcel -ErrorAction Stop

            # Get sheet info
            $sheets = Get-ExcelSheetInfo -Path $FilePath -ErrorAction Stop

            if ($TargetSheet) {
                # Validate specific sheet
                $sheet = $sheets | Where-Object { $_.Name -eq $TargetSheet }
                if (-not $sheet) {
                    $result.ErrorMessage = "Target sheet '$TargetSheet' not found in workbook"
                    $result.Compatible = $false
                    return $result
                }
            }
            else {
                # Use first sheet
                $sheet = $sheets | Select-Object -First 1
            }

            # Read header row from Excel (row 1 only; blank header cells are ignored)
            $headerData = Import-Excel -Path $FilePath -WorksheetName $sheet.Name -StartRow 1 -EndRow 1 -NoHeader -ErrorAction Stop
            $existingCols = @($headerData[0].PSObject.Properties.Value | Where-Object { $_ })
        }
        else {
            # CSV: Read first line (header).
            # Split only on commas that sit outside double quotes, so a quoted column name
            # containing a comma stays one column instead of being cut into bogus ones.
            $firstLine = Get-Content -Path $FilePath -First 1 -Encoding UTF8 -ErrorAction Stop
            $existingCols = @(($firstLine -split ',(?=(?:[^"]*"[^"]*")*[^"]*$)') | ForEach-Object { $_.Trim('"') })
        }

        $result.ExistingColumns = $existingCols
        $result.ExistingCount = $existingCols.Count

        # Detect explosion mode of existing file
        $result.ExistingMode = Get-LikelyExplosionParams -Columns $existingCols

        # Check if explosion modes match
        if ($result.ExistingMode.Mode -ne $result.CurrentMode.Mode) {
            $result.Compatible = $false
            $result.ErrorMessage = "Explosion parameter mismatch: existing file is '$($result.ExistingMode.DisplayName)' but current command uses '$($result.CurrentMode.DisplayName)'"
        }
        else {
            # Modes match - compatible
            # Note: We don't validate exact columns because explosion schemas are dynamic
            # and vary based on actual data content. As long as explosion params match,
            # the append will work correctly.
            $result.Compatible = $true
        }
    }
    catch {
        $result.ErrorMessage = $_.Exception.Message
        $result.Compatible = $false
    }

    return $result
}

# ============================================================================
# END APPENDFILE VALIDATION FUNCTIONS
# ============================================================================

# Validate MaxConcurrency range (Microsoft Purview enforces 10 concurrent search job limit per user account)
if ($MaxConcurrency -lt 1 -or $MaxConcurrency -gt 10) {
    Write-Host "ERROR: -MaxConcurrency must be between 1 and 10." -ForegroundColor Red
    Write-Host "Microsoft Purview enforces a maximum of 10 concurrent search jobs per user account." -ForegroundColor Yellow
    Write-Host "Current value: $MaxConcurrency" -ForegroundColor Yellow
    Write-Host "Please specify a value between 1 and 10 and re-run."
-ForegroundColor Yellow
    exit 1
}

# Establish date defaults / validation depending on mode.
# Dates are parsed with the invariant culture: passing $null to ParseExact uses
# the CURRENT culture, whose default calendar may not be Gregorian, which would
# silently shift a 'yyyy-MM-dd' value to a different year.
$paxInvariant = [System.Globalization.CultureInfo]::InvariantCulture

if ($RAWInputCSV) {
    # Replay mode: both dates are optional; an omitted bound becomes '*' (unbounded).
    $parsedStart = $null; $parsedEnd = $null
    if ($PSBoundParameters.ContainsKey('StartDate')) {
        try { $parsedStart = [datetime]::ParseExact($StartDate, 'yyyy-MM-dd', $paxInvariant) } catch { Write-Host "ERROR: StartDate must be yyyy-MM-dd if provided." -ForegroundColor Red; exit 1 }
    }
    if ($PSBoundParameters.ContainsKey('EndDate')) {
        try { $parsedEnd = [datetime]::ParseExact($EndDate, 'yyyy-MM-dd', $paxInvariant) } catch { Write-Host "ERROR: EndDate must be yyyy-MM-dd if provided." -ForegroundColor Red; exit 1 }
    }
    if ($parsedStart -and $parsedEnd -and $parsedEnd -lt $parsedStart) { Write-Host "ERROR: EndDate ($EndDate) is earlier than StartDate ($StartDate)." -ForegroundColor Red; exit 1 }
    if (-not $PSBoundParameters.ContainsKey('StartDate')) { $StartDate = '*' }
    if (-not $PSBoundParameters.ContainsKey('EndDate')) { $EndDate = '*' }
}
else {
    if (-not $PSBoundParameters.ContainsKey('StartDate') -and -not $PSBoundParameters.ContainsKey('EndDate')) {
        # Neither bound supplied: default to the previous full UTC day.
        $yesterdayUtc = (Get-Date).ToUniversalTime().Date.AddDays(-1)
        $StartDate = $yesterdayUtc.ToString('yyyy-MM-dd')
        $EndDate = $yesterdayUtc.AddDays(1).ToString('yyyy-MM-dd')
    }
    elseif (-not $PSBoundParameters.ContainsKey('StartDate')) {
        # Only EndDate supplied: open-ended start.
        $StartDate = '*'
        try {
            $parsedEnd = [datetime]::ParseExact($EndDate, 'yyyy-MM-dd', $paxInvariant)
        } catch { Write-Host "ERROR: EndDate must be yyyy-MM-dd format." -ForegroundColor Red; exit 1 }
    }
    elseif (-not $PSBoundParameters.ContainsKey('EndDate')) {
        # Only StartDate supplied: open-ended end.
        $EndDate = '*'
        try {
            $parsedStart = [datetime]::ParseExact($StartDate, 'yyyy-MM-dd', $paxInvariant)
        } catch { Write-Host "ERROR: StartDate must be yyyy-MM-dd format." -ForegroundColor Red; exit 1 }
    }
    else {
        try {
            $parsedStart = [datetime]::ParseExact($StartDate, 'yyyy-MM-dd', $paxInvariant)
            $parsedEnd = [datetime]::ParseExact($EndDate, 'yyyy-MM-dd', $paxInvariant)
        }
        catch { Write-Host "ERROR: StartDate/EndDate must be in yyyy-MM-dd format." -ForegroundColor Red; exit 1 }
        if ($parsedEnd -lt $parsedStart) { Write-Host "ERROR: EndDate ($EndDate) is earlier than StartDate ($StartDate)." -ForegroundColor Red; exit 1 }
    }
}

# Client-side date-range trim boundaries — Purview's partition-based indexing can
# return records outside the requested date range (observed up to ~10 h past EndDate).
# These UTC boundaries are used after dedup to trim any out-of-range records.
# SpecifyKind(Utc) is critical: ParseExact returns Kind=Unspecified, and .ToUniversalTime()
# on Unspecified assumes LOCAL time, shifting the boundary by the machine's UTC offset.
$script:TrimStartDateUTC = if ($StartDate -ne '*') { [datetime]::SpecifyKind([datetime]::ParseExact($StartDate, 'yyyy-MM-dd', $paxInvariant), [System.DateTimeKind]::Utc) } else { $null }
$script:TrimEndDateUTC = if ($EndDate -ne '*') { [datetime]::SpecifyKind([datetime]::ParseExact($EndDate, 'yyyy-MM-dd', $paxInvariant), [System.DateTimeKind]::Utc) } else { $null }
$script:DateTrimCount = 0

if ($BlockHours -le 0) { Write-Host "ERROR: BlockHours must be positive." -ForegroundColor Red; exit 1 }

# PowerShell 7+: make sure Write-Information output is not suppressed for this run.
try { if ($PSVersionTable.PSEdition -eq 'Core' -and ($global:InformationPreference -in @('SilentlyContinue', 'Ignore'))) { $global:InformationPreference = 'Continue' } } catch {}

# Replay mode (-RAWInputCSV) performs no live query, so live-query parameters are rejected.
if ($RAWInputCSV) {
    $rawConflictParams = @('BlockHours', 'ResultSize', 'PacingMs', 'Auth', 'ParallelMode', 'MaxParallelGroups', 'MaxConcurrency', 'EnableParallel', 'GroupNames')
    $specifiedConflicts = @()
    foreach ($cp in $rawConflictParams) { if ($PSBoundParameters.ContainsKey($cp)) { $specifiedConflicts += $cp } }
    if ($specifiedConflicts.Count -gt 0) {
        Write-Host "ERROR: -RAWInputCSV cannot be combined with live query parameter(s): $($specifiedConflicts -join ', ')" -ForegroundColor Red
        Write-Host "Remove those conflicting parameters and re-run. Allowed with RAWInputCSV: StartDate, EndDate, ActivityTypes, AgentId, AgentsOnly, UserIds, OutputFile, AppendFile, explosion switches." -ForegroundColor Yellow
        Write-Host "Note: -GroupNames requires authentication and cannot be used in replay mode. Use -UserIds with explicit email addresses instead."
-ForegroundColor Yellow
        exit 1
    }
}

# Validate -UseEOM compatibility with parallel processing.
# EOM mode is serial-only: explicit parallel switches are a hard error, and an
# implicit non-Off ParallelMode is silently downgraded to Off with a notice.
if ($UseEOM) {
    $eomConflicts = [System.Collections.Generic.List[string]]::new()

    # Explicit parallel switches supplied on the command line
    if ($PSBoundParameters.ContainsKey('EnableParallel') -and $EnableParallel) {
        $eomConflicts.Add('-EnableParallel')
    }
    if ($PSBoundParameters.ContainsKey('ParallelMode') -and $ParallelMode -ne 'Off') {
        $eomConflicts.Add("-ParallelMode $ParallelMode")
    }

    if ($eomConflicts.Count -gt 0) {
        $rule = "════════════════════════════════════════════════════════════════════"

        Write-Host ""
        Write-Host $rule -ForegroundColor Red
        Write-Host " ERROR: -UseEOM Incompatible with Parallel Processing" -ForegroundColor Red
        Write-Host $rule -ForegroundColor Red
        Write-Host ""
        Write-Host "Exchange Online Management mode (-UseEOM) only supports SERIAL processing." -ForegroundColor Yellow
        Write-Host "The Search-UnifiedAuditLog cmdlet cannot be used in parallel ThreadJobs due to" -ForegroundColor Gray
        Write-Host "implicit remoting architecture limitations in the EOM PowerShell module." -ForegroundColor Gray
        Write-Host ""
        Write-Host "CONFLICTING PARAMETERS DETECTED:" -ForegroundColor Yellow
        foreach ($item in $eomConflicts) {
            Write-Host " • $item" -ForegroundColor Red
        }
        Write-Host ""
        Write-Host "RESOLUTION OPTIONS:" -ForegroundColor Cyan
        Write-Host " 1. Remove -UseEOM switch to enable Graph API mode (supports parallel processing)" -ForegroundColor White
        Write-Host " 2. Remove parallel parameters and use serial-only processing with -UseEOM" -ForegroundColor White
        Write-Host " 3. Set -ParallelMode Off explicitly: -UseEOM -ParallelMode Off" -ForegroundColor White
        Write-Host ""
        Write-Host "NOTE: Graph API mode (default, no -UseEOM) supports parallel processing in PowerShell 7+." -ForegroundColor Gray
        Write-Host ""
        Write-Host $rule -ForegroundColor Red

        # Mirror the failure into the log file when one already exists
        if ($script:logFile -and (Test-Path $script:logFile)) {
            $stamp = Get-Date -Format 'yyyy-MM-dd HH:mm:ss'
            Add-Content -Path $script:logFile -Value "[$stamp] ERROR: -UseEOM incompatible with parallel processing parameters: $($eomConflicts -join ', ')"
            Add-Content -Path $script:logFile -Value "[$stamp] Script terminated. Resolution: Remove -UseEOM or disable parallel mode."
        }

        exit 1
    }

    # No explicit conflict: still force serial mode (covers a non-Off default such as Auto)
    if ($ParallelMode -ne 'Off') {
        Write-Host ""
        Write-Host "NOTE: -UseEOM mode requires serial processing. Forcing -ParallelMode Off." -ForegroundColor Yellow
        Write-Host ""
        $ParallelMode = 'Off'
    }
}

# Adaptive block-size learning and result-limit tracking state
$script:learnedActivityBlockSize = @{}
$script:globalLearnedBlockSize = $BlockHours
# Subdivision steps: 12h, 6h, 3.2h, 1.6h, 48m, 24m, 15m, 8m, 4m, 2m
$script:subdivisionSequence = @(0.5, 0.25, 0.133333, 0.066667, 0.033333, 0.016667, 0.010417, 0.005556, 0.002778, 0.001389)
$script:Hit10KLimit = $false
$script:Hit1MLimit = $false # Graph API 1,000,000 record limit per query
$script:LimitTimeWindow = ""
$script:SubdividedPartitions = @{} # Track partitions that needed subdivision (key=original range, value=count)
$script:Connected = $false

# ============================================================================
# GRAPH API SECURITY AUDIT ENDPOINT VERSION CONFIGURATION
# ============================================================================
# Manually configure these variables if Microsoft updates the API version
# PAX will try CURRENT version first, then fallback to PREVIOUS version
# ============================================================================
$script:GraphAuditApiVersion_Current = 'v1.0' # Try this version first (expected GA in Q1 2026)
$script:GraphAuditApiVersion_Previous = 'beta' # Fallback to this version if current unavailable
$script:GraphAuditApiVersion = $null # Runtime-detected version (do not edit)
# ============================================================================

# Suppress PowerShell's web request progress bar (prevents "Reading web response stream" noise)
$ProgressPreference = 'SilentlyContinue'

# Telemetry tracking for Graph API parallel queries (per-slice lifecycle data)
$script:telemetryData = @()

# Run-wide counters and timings, all starting at zero/empty
$script:metrics = @{
    StartTime = (Get-Date).ToUniversalTime()
    QueryMs = 0
    ExplosionMs = 0
    ExportMs = 0
    PagesFetched = 0
    TotalRecordsFetched = 0
    TotalStructuredRows = 0
    ExplosionEvents = 0
    ExplosionRowsFromEvents = 0
    ExplosionMaxPerRecord = 0
    ExplosionTruncated = $false
    ShrinkEvents = 0
    Activities = @{}
    EffectiveChunkSize = 0
    ParallelBatchSizeFinal = 0
    ParallelThrottleFinal = 0
    AgentFilterApplied = $false
    AgentFilterPreCount = 0
    AgentFilterPostCount = 0
    AgentFilterRemovedCount = 0
    AgentFilterElapsedSec = 0
    ExcludeAgentsApplied = $false
    ExcludeAgentsPreCount = 0
    ExcludeAgentsPostCount = 0
    ExcludeAgentsRemoved = 0
    ExcludeAgentsElapsedSec = 0
    PromptFilterApplied = $false
    PromptFilterType = ''
    PromptFilterPreCount = 0
    PromptFilterPostCount = 0
    PromptFilterRemovedCount = 0
    PromptFilterElapsedSec = 0
    PromptFilterMsgBefore = 0
    PromptFilterMsgAfter = 0
    PromptFilterMsgRemoved = 0
    PromptFilterRecordsMixed = 0
    PromptFilterRecordsPromptOnly = 0
    PromptFilterRecordsResponseOnly = 0
    PromptFilterRecordsNoMessages = 0
    FilteringSkippedRecords = 0
    FilteringMissingAuditData = 0
    FilteringParseFailures = 0
    FilteringPromptFiltered = 0
    FilteringAgentFiltered = 0
    FilteringExcludeAgents = 0
    FilteringUserIds = 0
    FilteringGroupNames = 0
    FilteringOther = 0
    AdaptiveEvents = @()
    AdaptiveMemoryReductions = 0
    AdaptiveLatencyReductions = 0
    AdaptiveLatencyIncreases = 0
    ThroughputBaselineRps = 0
    CircuitBreakerTrips = 0
BackoffTotalDelaySeconds = 0
    PartitionCapsApplied = 0
    PartitionCapHighestRequested = 0
}

$script:summaryWritten = $false

# Streaming dataset profiler (live & replay)
$script:profiler = @{
    Rows        = 0
    Operations  = @{}
    RecordTypes = @{}
    HasCopilot  = 0
    MaxDepth    = 0
    DepthCounts = @{}
    MaxArrayLen = 0
}

# Cache of computed shapes, keyed by the string from Get-RecordShapeKey
$script:shapeCache = @{}

function Get-RecordShapeKey {
    <#
    .SYNOPSIS
        Builds the cache key "RecordType|Operation|HasCopilot" for an audit record.
    .PARAMETER AuditData
        Parsed audit record. Any property that cannot be read contributes an
        empty string (or $false for the CopilotEventData flag).
    #>
    param([object]$AuditData)
    try {
        $rt = $AuditData.RecordType
    } catch { $rt = '' }
    try {
        $op = $AuditData.Operation
    } catch { $op = '' }
    try {
        # $null on the left: avoids collection-filtering semantics of -ne
        $hasCopilot = $null -ne $AuditData.PSObject.Properties['CopilotEventData']
    } catch { $hasCopilot = $false }
    return "$rt|$op|$hasCopilot"
}

function Get-RecordShape {
    <#
    .SYNOPSIS
        Returns (and caches) a shape descriptor for an audit record.
    .DESCRIPTION
        The descriptor holds RecordType, Operation, HasCopilot, Depth and Mode
        ('Copilot' when CopilotEventData is present, otherwise 'AuditData').
        It is computed once per shape key, so Depth reflects the first record
        seen with that key.
    .OUTPUTS
        Hashtable, or $null when AuditData is $null.
    #>
    param([object]$AuditData)
    if ($null -eq $AuditData) { return $null }
    $key = Get-RecordShapeKey $AuditData
    if ($script:shapeCache.ContainsKey($key)) { return $script:shapeCache[$key] }
    $shape = @{}
    try {
        $shape.RecordType = $AuditData.RecordType
        $shape.Operation = $AuditData.Operation
    } catch {}
    try { $shape.HasCopilot = $null -ne $AuditData.PSObject.Properties['CopilotEventData'] } catch { $shape.HasCopilot = $false }
    # Get-JsonDepth also updates $script:profiler.MaxArrayLen as a side effect
    try { $shape.Depth = Get-JsonDepth $AuditData 0 } catch { $shape.Depth = 0 }
    $shape.Mode = if ($shape.HasCopilot) { 'Copilot' } else { 'AuditData' }
    $script:shapeCache[$key] = $shape
    return $shape
}

function Reset-Profiler {
    # Restores $script:profiler to its initial all-zero/empty state.
    $script:profiler = @{
        Rows        = 0
        Operations  = @{}
        RecordTypes = @{}
        HasCopilot  = 0
        MaxDepth    = 0
        DepthCounts = @{}
        MaxArrayLen = 0
    }
}

# Returns the maximum nesting depth below $node, starting the count at $d.
# Dictionaries and non-string enumerables add one level per nesting step; the
# longest enumerable seen is recorded in $script:profiler.MaxArrayLen.
# NOTE(review): only IDictionary / IEnumerable nodes are descended into. Whether
# PSCustomObject nodes (e.g. ConvertFrom-Json output) are meant to be traversed
# depends on Test-ScalarValue, which is defined elsewhere - confirm.
function Get-JsonDepth([object]$node, [int]$d = 0) {
    if ($null -eq $node -or (Test-ScalarValue $node)) { return $d }
    if ($node -is [System.Collections.IDictionary]) {
        $maxd = $d
        foreach ($v in $node.Values) { $maxd = [math]::Max($maxd, (Get-JsonDepth $v ($d + 1))) }
        return $maxd
    }
    if ($node -is [System.Collections.IEnumerable] -and -not ($node -is [string])) {
        $maxd = $d
        $i = 0
        foreach ($el in $node) { $maxd = [math]::Max($maxd, (Get-JsonDepth $el ($d + 1))); $i++ }
        if ($i -gt $script:profiler.MaxArrayLen) { $script:profiler.MaxArrayLen = $i }
        return $maxd
    }
    return $d
}

function Profile-AuditData {
    <#
    .SYNOPSIS
        Folds one audit record into the streaming profiler counters.
    .DESCRIPTION
        Best-effort: every step is wrapped so that a malformed record never
        interrupts processing; failures are intentionally ignored.
    #>
    param([object]$AuditData)
    if ($null -eq $AuditData) { return }
    try {
        $script:profiler.Rows++
        # Operation
        try {
            $op = $AuditData.Operation
            if (-not [string]::IsNullOrWhiteSpace($op)) {
                if (-not $script:profiler.Operations.ContainsKey($op)) { $script:profiler.Operations[$op] = 0 }
                $script:profiler.Operations[$op] += 1
            }
        } catch {}
        # RecordType
        try {
            $rt = $AuditData.RecordType
            if (-not [string]::IsNullOrWhiteSpace([string]$rt)) {
                if (-not $script:profiler.RecordTypes.ContainsKey([string]$rt)) { $script:profiler.RecordTypes[[string]$rt] = 0 }
                $script:profiler.RecordTypes[[string]$rt] += 1
            }
        } catch {}
        # CopilotEventData presence
        try { if ($AuditData.PSObject.Properties['CopilotEventData']) { $script:profiler.HasCopilot++ } } catch {}
        # Depth & arrays
        $depth = Get-JsonDepth $AuditData 0
        if ($depth -gt $script:profiler.MaxDepth) { $script:profiler.MaxDepth = $depth }
        if (-not $script:profiler.DepthCounts.ContainsKey($depth)) { $script:profiler.DepthCounts[$depth] = 0 }
        $script:profiler.DepthCounts[$depth] += 1
    } catch {}
}

function Write-ProfilerSummary {
    <#
    .SYNOPSIS
        Logs the profiler totals plus the most frequent operations and depths.
    .PARAMETER TopOps
        Maximum number of operations to list (by descending count).
    .PARAMETER TopDepths
        Maximum number of depth buckets to list (by descending count).
    #>
    param([int]$TopOps = 20, [int]$TopDepths = 10)
    try {
        Write-LogHost "Profiler: Rows=$($script:profiler.Rows), MaxDepth=$($script:profiler.MaxDepth), MaxArrayLen=$($script:profiler.MaxArrayLen), HasCopilot=$($script:profiler.HasCopilot)" -ForegroundColor Gray
        if ($script:profiler.Operations.Count -gt 0) {
            Write-LogHost "Profiler: Operations (top $TopOps):" -ForegroundColor Gray
            $script:profiler.Operations.GetEnumerator() | Sort-Object Value -Descending | Select-Object -First $TopOps | ForEach-Object { Write-LogHost " $($_.Key): $($_.Value)" -ForegroundColor Gray }
        }
        if ($script:profiler.DepthCounts.Count -gt 0) {
            Write-LogHost "Profiler: Depth distribution (top $TopDepths):" -ForegroundColor Gray
            $script:profiler.DepthCounts.GetEnumerator() | Sort-Object Value -Descending | Select-Object -First $TopDepths | ForEach-Object { Write-LogHost " Depth $($_.Key): $($_.Value)" -ForegroundColor Gray }
        }
    } catch {}
}
$script:adaptiveThroughputBaseline = $null
$script:adaptiveLowLatencyStreak = 0
$script:consecutiveBlockFailures = 0
$script:circuitBreakerOpen = $false
$script:circuitBreakerOpenUntil = $null

# ==============================================
# GRAPH API VERSION DETECTION HELPER
# ==============================================
# Automatically detects and uses configured current version or falls back to previous version
# Version configuration is at top of script for easy manual updates

function Get-GraphAuditApiUri {
    <#
    .SYNOPSIS
        Builds Graph API audit endpoint URI with automatic version detection.

    .DESCRIPTION
        Attempts to use the configured current version first. If not available,
        falls back to the previous version. Version detection is cached per session.

        Configure versions at top of script:
        $script:GraphAuditApiVersion_Current = 'v1.0' (try first)
        $script:GraphAuditApiVersion_Previous = 'beta' (fallback)

    .PARAMETER Path
        The audit API path (e.g., "queries", "queries/{id}", "queries/{id}/records").
        A leading "/" is tolerated and removed.

    .OUTPUTS
        String - Full Graph API URI with appropriate version

    .EXAMPLE
        $uri = Get-GraphAuditApiUri -Path "queries"
        # Returns: https://graph.microsoft.com/v1.0/security/auditLog/queries
        # or: https://graph.microsoft.com/beta/security/auditLog/queries (if v1.0 unavailable)
    #>
    param(
        [Parameter(Mandatory = $true)]
        [string]$Path
    )

    # Auto-detect version on first use (cached for session)
    if ($null -eq $script:GraphAuditApiVersion) {
        $currentVer = $script:GraphAuditApiVersion_Current
        $previousVer = $script:GraphAuditApiVersion_Previous

        try {
            # Test if current version endpoint is available
            $testUri = "https://graph.microsoft.com/$currentVer/security/auditLog/queries"
            Invoke-MgGraphRequest -Method GET -Uri $testUri -ErrorAction Stop | Out-Null
            $script:GraphAuditApiVersion = $currentVer
            Write-LogHost "Graph API: security/auditLog endpoint using version $currentVer" -ForegroundColor Green
        } catch {
            # Current version not available, fallback to previous.
            # NOTE(review): ANY probe failure (not only "version not found")
            # lands here and the fallback is then cached for the session.
            $script:GraphAuditApiVersion = $previousVer
            Write-LogHost "Graph API: security/auditLog endpoint using version $previousVer (fallback from $currentVer)" -ForegroundColor Yellow
        }
    }

    # Normalise the path so "/queries" and "queries" produce the same URI
    $relativePath = $Path.TrimStart('/')
    return "https://graph.microsoft.com/$($script:GraphAuditApiVersion)/security/auditLog/$relativePath"
}

# ==============================================
# CTRL+C GRACEFUL EXIT HANDLER
# ==============================================
# Track Ctrl+C state for graceful exit messaging in finally block

$script:CtrlCPressed = $false
$script:ScriptCompleted = $false
$script:EarlyExit = $false

# Register exit handler that ALWAYS runs when PowerShell exits
# This works even when Ctrl+C is pressed before the try block (e.g., during
module loading)
# Uses environment variable for cross-runspace communication since Register-EngineEvent runs in isolated scope
$env:PAX_GRACEFUL_EXIT_DONE = $null
$env:PAX_REPLAY_MODE = $null # Will be set to "1" when RAWInputCSV is used
Register-EngineEvent -SourceIdentifier PowerShell.Exiting -Action {
    # Only speak up when Invoke-GracefulExit has not already handled the exit
    if (-not $env:PAX_GRACEFUL_EXIT_DONE) {
        # Skip interrupt messaging in replay mode - no Graph connection to disconnect
        if (-not $env:PAX_REPLAY_MODE) {
            Write-Host ""
            Write-Host "============================================================================================================" -ForegroundColor Yellow
            Write-Host " Script Interrupted - Performing Graceful Cleanup" -ForegroundColor Yellow
            Write-Host "============================================================================================================" -ForegroundColor Yellow
            Write-Host ""
            Write-Host " Cleanup complete. Exiting..." -ForegroundColor Green
            Write-Host ""
        }
    }
} | Out-Null

# Define cleanup function (used by catch block for PipelineStoppedException)
function Invoke-GracefulExit {
    <#
    .SYNOPSIS
        Performs interrupt cleanup (disconnects, logging, checkpoint hint) and exits with code 0.

    .DESCRIPTION
        Runs at most once per script run. Marks the exit as handled via
        $env:PAX_GRACEFUL_EXIT_DONE so the PowerShell.Exiting handler stays quiet.
        In replay mode it exits immediately because no connections were opened.

    .PARAMETER Reason
        Free-text reason for the exit. Accepted for caller compatibility; not
        used by the visible implementation.
    #>
    param([string]$Reason = "Script interrupted")

    if ($script:CtrlCPressed) { return } # Prevent multiple invocations
    $script:CtrlCPressed = $true

    # Signal to engine event handler that graceful exit is handling this
    $env:PAX_GRACEFUL_EXIT_DONE = "1"

    # Skip interrupt messaging and Graph disconnect in replay mode - no connections to clean up
    if ($env:PAX_REPLAY_MODE) {
        exit 0
    }

    Write-Host ""
    Write-Host "============================================================================================================" -ForegroundColor Yellow
    Write-Host " Script Interrupted - Performing Graceful Cleanup" -ForegroundColor Yellow
    Write-Host "============================================================================================================" -ForegroundColor Yellow
    Write-Host ""

    # Disconnect from Microsoft Graph whenever the cmdlet exists. If it does not
    # exist at all there is no session to clear, so do not claim one was cleared.
    if (Get-Command -Name Disconnect-MgGraph -ErrorAction SilentlyContinue) {
        Write-Host " Disconnecting from Microsoft Graph..." -ForegroundColor Cyan
        try {
            Disconnect-MgGraph -ErrorAction Stop | Out-Null
            Write-Host " Microsoft Graph disconnected" -ForegroundColor Green
        }
        catch {
            if ($_.Exception.Message -match 'No application to sign out from') {
                Write-Host " (Not connected to Microsoft Graph)" -ForegroundColor DarkGray
            } else {
                Write-Host " Microsoft Graph session cleared" -ForegroundColor Green
            }
        }
    }
    else {
        Write-Host " (Not connected to Microsoft Graph)" -ForegroundColor DarkGray
    }

    # Disconnect from Exchange Online (if connected via EOM mode)
    try {
        $eomConnected = $false

        # Current ExchangeOnlineManagement versions connect over REST and create
        # no PSSession; Get-ConnectionInformation is how such sessions are found.
        if (Get-Command -Name Get-ConnectionInformation -ErrorAction SilentlyContinue) {
            $activeConn = Get-ConnectionInformation -ErrorAction SilentlyContinue | Where-Object { $_.TokenStatus -ne 'Expired' } | Select-Object -First 1
            if ($activeConn) { $eomConnected = $true }
        }

        # Fallback for remote-PowerShell based sessions
        if (-not $eomConnected) {
            $eomSession = Get-PSSession | Where-Object { $_.ConfigurationName -eq 'Microsoft.Exchange' -and $_.State -eq 'Opened' }
            if ($eomSession) { $eomConnected = $true }
        }

        if ($eomConnected) {
            Write-Host " Disconnecting from Exchange Online Management..." -ForegroundColor Cyan
            Disconnect-ExchangeOnline -Confirm:$false -ErrorAction SilentlyContinue | Out-Null
            Write-Host " Exchange Online disconnected" -ForegroundColor Green
        }
    }
    catch {
        Write-Host " (Exchange Online cleanup completed)" -ForegroundColor Gray
    }

    # Log the graceful exit
    if ($LogFile -and (Test-Path $LogFile)) {
        Write-Output "" | Out-File -FilePath $LogFile -Append -Encoding utf8
        Write-Output "[$(Get-Date -Format 'yyyy-MM-dd HH:mm:ss')] Script interrupted by user (Ctrl+C)" | Out-File -FilePath $LogFile -Append -Encoding utf8
        Write-Output "[$(Get-Date -Format 'yyyy-MM-dd HH:mm:ss')] Graceful cleanup completed" | Out-File -FilePath $LogFile -Append -Encoding utf8
    }

    # Show checkpoint resume message if checkpoint is enabled
    if ($script:CheckpointEnabled -and $script:CheckpointPath -and (Test-Path $script:CheckpointPath)) {
        Show-CheckpointExitMessage
    }

    Write-Host ""
    Write-Host " Cleanup complete. Exiting..." -ForegroundColor Green
    Write-Host ""

    # Exit cleanly (env var PAX_GRACEFUL_EXIT_DONE already set at function start)
    exit 0
}

# Trap for catching terminating errors (including Ctrl+C)
trap {
    if ($_.Exception -is [System.Management.Automation.PipelineStoppedException]) {
        Invoke-GracefulExit
        break
    }
    # Re-throw other exceptions
    throw $_
}

# ==============================================
# MODULE PREREQUISITES
# ==============================================
# Load required modules based on mode selection (-UseEOM vs Graph API default)

if ($RAWInputCSV) {
    # Set replay mode flag for graceful exit handling (skip Graph disconnect messaging)
    $env:PAX_REPLAY_MODE = "1"
    Write-LogHost "`nReplay mode: Skipping module loading`n" -ForegroundColor Cyan
}
elseif (-not $UseEOM) {
    # DEFAULT MODE: Microsoft Graph Security API
    # Requires Microsoft.Graph.Authentication and Microsoft.Graph.Security modules
    Write-LogHost "`nLoading Microsoft Graph modules..." -ForegroundColor Cyan

    try {
        # ============================================
        # AUTO-UPDATE CHECK: Ensure latest SDK version
        # ============================================
        # The Graph Security auditLog API has known issues with older SDK versions.
        # Always check for and install the latest version to ensure compatibility.

        Write-LogHost " Checking for Microsoft Graph SDK updates..."
-ForegroundColor Gray

        # Get currently installed version
        $installedAuth = Get-Module -ListAvailable -Name Microsoft.Graph.Authentication | Sort-Object Version -Descending | Select-Object -First 1
        $installedAuthVersion = if ($installedAuth) { $installedAuth.Version } else { [Version]"0.0.0" }

        # Check PSGallery for latest version (with 15-second timeout to avoid hangs)
        $latestAuthVersion = $null
        $updateCheckJob = $null
        try {
            $updateCheckJob = Start-Job -ScriptBlock { Find-Module -Name Microsoft.Graph.Authentication -Repository PSGallery -ErrorAction Stop }
            $jobCompleted = Wait-Job -Job $updateCheckJob -Timeout 15
            if ($jobCompleted) {
                $galleryAuth = Receive-Job -Job $updateCheckJob -ErrorAction Stop
                $latestAuthVersion = [Version]$galleryAuth.Version
            }
            else {
                Write-LogHost " PSGallery check timed out (15s) - skipping update check" -ForegroundColor Yellow
            }
        }
        catch {
            Write-LogHost " Warning: Could not check PSGallery for updates: $($_.Exception.Message)" -ForegroundColor Yellow
            Write-LogHost " Continuing with installed version..." -ForegroundColor Yellow
        }
        finally {
            # Always discard the background job, including when Receive-Job
            # throws, so a failed check does not leave a job behind.
            if ($updateCheckJob) {
                Remove-Job -Job $updateCheckJob -Force -ErrorAction SilentlyContinue
            }
        }

        # Update if newer version available
        $updatePerformed = $false
        if ($latestAuthVersion -and ($latestAuthVersion -gt $installedAuthVersion)) {
            Write-LogHost " Update available: v$installedAuthVersion → v$latestAuthVersion" -ForegroundColor Yellow
            Write-LogHost " Installing Microsoft.Graph.Authentication v$latestAuthVersion..." -ForegroundColor Yellow

            try {
                # Install latest Authentication module
                Install-Module -Name Microsoft.Graph.Authentication -Scope CurrentUser -Force -AllowClobber -ErrorAction Stop
                Write-LogHost " Microsoft.Graph.Authentication updated to v$latestAuthVersion" -ForegroundColor Green

                # Install matching Security module
                Write-LogHost " Installing Microsoft.Graph.Security v$latestAuthVersion..." -ForegroundColor Yellow
                Install-Module -Name Microsoft.Graph.Security -RequiredVersion $latestAuthVersion -Scope CurrentUser -Force -AllowClobber -ErrorAction Stop
                Write-LogHost " Microsoft.Graph.Security updated to v$latestAuthVersion" -ForegroundColor Green

                $updatePerformed = $true

                # Refresh module info after update
                $installedAuth = Get-Module -ListAvailable -Name Microsoft.Graph.Authentication | Sort-Object Version -Descending | Select-Object -First 1
                $installedAuthVersion = $installedAuth.Version
            }
            catch {
                Write-LogHost " Warning: Update failed: $($_.Exception.Message)" -ForegroundColor Yellow
                Write-LogHost " Continuing with existing version v$installedAuthVersion..." -ForegroundColor Yellow
            }
        }
        elseif ($latestAuthVersion) {
            Write-LogHost " Microsoft Graph SDK is up to date (v$installedAuthVersion)" -ForegroundColor Green
        }
        else {
            Write-LogHost " Using installed version v$installedAuthVersion" -ForegroundColor Gray
        }

        # ============================================
        # LOAD MODULES
        # ============================================

        # Prefer a version already loaded in this session, then the newest installed one
        $authModule = Get-Module -Name Microsoft.Graph.Authentication | Select-Object -First 1
        if (-not $authModule) {
            $authModule = Get-Module -ListAvailable -Name Microsoft.Graph.Authentication | Sort-Object Version -Descending | Select-Object -First 1
        }
        if (-not $authModule) {
            Write-LogHost " Installing Microsoft.Graph.Authentication module (CurrentUser scope)..." -ForegroundColor Yellow
            Install-Module -Name Microsoft.Graph.Authentication -Scope CurrentUser -Force -AllowClobber -ErrorAction Stop
            $authModule = Get-Module -ListAvailable -Name Microsoft.Graph.Authentication | Sort-Object Version -Descending | Select-Object -First 1
        }
        $authVersion = $authModule.Version
        Write-LogHost " Importing Microsoft.Graph.Authentication v$authVersion..." -ForegroundColor Gray
        Import-Module Microsoft.Graph.Authentication -RequiredVersion $authVersion -Force -ErrorAction Stop
        Write-LogHost " Microsoft.Graph.Authentication v$authVersion loaded" -ForegroundColor Green

        # Load Microsoft.Graph.Security matching auth version (exact if possible, otherwise same major/minor)
        $securityModule = Get-Module -Name Microsoft.Graph.Security | Where-Object { $_.Version -eq $authVersion } | Select-Object -First 1
        if (-not $securityModule) {
            $securityModule = Get-Module -ListAvailable -Name Microsoft.Graph.Security |
                Where-Object { $_.Version.Major -eq $authVersion.Major -and $_.Version.Minor -eq $authVersion.Minor } |
                Sort-Object Version -Descending |
                Select-Object -First 1
        }
        if (-not $securityModule) {
            Write-LogHost " Installing Microsoft.Graph.Security v$authVersion (CurrentUser scope)..." -ForegroundColor Yellow
            Install-Module -Name Microsoft.Graph.Security -RequiredVersion $authVersion -Scope CurrentUser -Force -AllowClobber -ErrorAction Stop
            $securityModule = Get-Module -ListAvailable -Name Microsoft.Graph.Security | Where-Object { $_.Version -eq $authVersion } | Select-Object -First 1
        }
        $secVersion = $securityModule.Version
        Write-LogHost " Importing Microsoft.Graph.Security v$secVersion..." -ForegroundColor Gray
        Import-Module Microsoft.Graph.Security -RequiredVersion $secVersion -Force -ErrorAction Stop
        Write-LogHost " Microsoft.Graph.Security v$secVersion loaded" -ForegroundColor Green
    }
    catch {
        Write-LogHost " ERROR: Failed to load Microsoft Graph module: $($_.Exception.Message)" -ForegroundColor Red
        Write-LogHost "`nTroubleshooting:" -ForegroundColor Yellow
        Write-LogHost " 1. Ensure PowerShell Gallery access is available" -ForegroundColor White
        Write-LogHost " 2. Try manual installation: Install-Module -Name Microsoft.Graph -Force" -ForegroundColor White
        Write-LogHost " 3. Use -UseEOM switch to fall back to Exchange Online Management mode" -ForegroundColor White
        throw
    }

    Write-LogHost "Microsoft Graph modules loaded successfully`n" -ForegroundColor Green
}
else {
    # EOM MODE: Exchange Online Management
    # Graph modules not required in EOM mode
    Write-LogHost "`nEOM Mode: Skipping Microsoft Graph module loading`n" -ForegroundColor Cyan
}

# ==============================================
# DUAL-MODE AUTHENTICATION FUNCTION
# ==============================================
# Unified authentication supporting both EOM and Graph API modes

function Connect-PurviewAudit {
    <#
    .SYNOPSIS
        Unified authentication for Purview audit log access via EOM or Graph API.

    .DESCRIPTION
        Authenticates to Microsoft 365 using either Exchange Online Management (EOM)
        or Microsoft Graph Security API based on the -UseEOM switch.

        EOM Mode (-UseEOM):
        - Uses Connect-ExchangeOnline cmdlet
        - Requires Exchange Online RBAC roles
        - Serial processing only

        Graph API Mode (Default):
        - Uses Connect-MgGraph with AuditLog.Read.All scope
        - Requires Azure AD roles + Graph API permissions
        - Supports parallel processing

    .PARAMETER AuthMethod
        Authentication method: WebLogin, DeviceCode, Credential, Silent, AppRegistration

    .PARAMETER UseEOMMode
        If true, use EOM mode. If false, use Graph API mode.
    #>

    param(
        [Parameter(Mandatory = $true)]
        [ValidateSet('WebLogin', 'DeviceCode', 'Credential', 'Silent', 'AppRegistration')]
        [string]$AuthMethod,

        [Parameter(Mandatory = $false)]
        [bool]$UseEOMMode = $false
    )

    if ($UseEOMMode) {
        # ========================================
        # EOM MODE: Exchange Online Management
        # ========================================

        if ($script:Connected) {
            Write-LogHost "Already connected to Exchange Online." -ForegroundColor Gray
            return
        }

        Write-LogHost "Connecting to Microsoft 365 Security & Compliance Center (EOM)..."
-ForegroundColor Cyan + + # Ensure ExchangeOnlineManagement module is available + try { + $existingEOM = Get-Module -ListAvailable -Name ExchangeOnlineManagement | Sort-Object Version -Descending | Select-Object -First 1 + if (-not $existingEOM) { + Write-LogHost "Installing ExchangeOnlineManagement module (CurrentUser scope)..." -ForegroundColor Yellow + Install-Module -Name ExchangeOnlineManagement -Scope CurrentUser -Force -AllowClobber -ErrorAction Stop + } + Import-Module ExchangeOnlineManagement -Force -ErrorAction Stop + + $eomVersion = (Get-Module ExchangeOnlineManagement).Version + Write-LogHost " ExchangeOnlineManagement v$eomVersion loaded" -ForegroundColor Green + } + catch { + Write-LogHost "ERROR: Module load/install failure: $($_.Exception.Message)" -ForegroundColor Red + throw + } + + # Authenticate based on method + try { + switch ($AuthMethod.ToLower()) { + 'appregistration' { + Write-LogHost "AppRegistration authentication is not supported with -UseEOM. Remove -UseEOM to use Graph mode." -ForegroundColor Yellow + throw "AppRegistration authentication is only available in Graph API mode" + } + 'weblogin' { + $exoCmd = Get-Command Connect-ExchangeOnline -ErrorAction Stop + $hasUseWeb = $exoCmd.Parameters.ContainsKey('UseWebLogin') + + if ($hasUseWeb) { + Write-LogHost "Using Connect-ExchangeOnline -UseWebLogin..." -ForegroundColor Gray + Connect-ExchangeOnline -ShowBanner:$false -UseWebLogin -ErrorAction Stop | Out-Null + } + else { + Write-LogHost "UseWebLogin parameter not available; using standard interactive auth..." -ForegroundColor Yellow + Connect-ExchangeOnline -ShowBanner:$false -ErrorAction Stop | Out-Null + } + } + + 'devicecode' { + Write-LogHost "Using device code flow..." -ForegroundColor Gray + Connect-ExchangeOnline -ShowBanner:$false -Device -ErrorAction Stop | Out-Null + } + + 'credential' { + Write-LogHost "Using credential-based authentication..." 
-ForegroundColor Gray + $cred = Get-Credential -Message 'Enter admin credentials for Exchange Online' + Connect-ExchangeOnline -ShowBanner:$false -Credential $cred -ErrorAction Stop | Out-Null + } + + 'silent' { + Write-LogHost "Attempting silent authentication..." -ForegroundColor Gray + $silentOk = $true + try { + Connect-ExchangeOnline -ShowBanner:$false -ErrorAction Stop | Out-Null + } + catch { + $silentOk = $false + } + + if (-not $silentOk) { + Write-LogHost "Silent auth failed, falling back to WebLogin..." -ForegroundColor Yellow + try { + Connect-ExchangeOnline -ShowBanner:$false -UseWebLogin -ErrorAction Stop | Out-Null + } + catch { + Write-LogHost "ERROR: Silent + fallback auth failed: $($_.Exception.Message)" -ForegroundColor Red + throw + } + } + } + } + + $script:Connected = $true + Write-LogHost "Successfully connected to Exchange Online" -ForegroundColor Green + + # Verify connection + try { + $connInfo = Get-ConnectionInformation -ErrorAction SilentlyContinue | Where-Object { $_.TokenStatus -ne 'Expired' } | Select-Object -First 1 + if ($connInfo) { + Write-LogHost " Tenant ID: $($connInfo.TenantId)" -ForegroundColor Gray + Write-LogHost " User: $($connInfo.UserPrincipalName)" -ForegroundColor Gray + } + } + catch { + # Connection info not critical, continue + } + } + catch { + Write-LogHost "ERROR: EOM authentication failed: $($_.Exception.Message)" -ForegroundColor Red + Write-LogHost "`nTroubleshooting:" -ForegroundColor Yellow + Write-LogHost " 1. Verify you have required Exchange Online roles" -ForegroundColor White + Write-LogHost " 2. Check Multi-Factor Authentication requirements" -ForegroundColor White + Write-LogHost " 3. Try a different auth method (-Auth parameter)" -ForegroundColor White + throw + } + } + else { + # ======================================== + # GRAPH API MODE: Microsoft Graph Security + # ======================================== + + # Clear any stale Graph session from a previous script run or Ctrl+C in this terminal. 
+ # This forces a fresh Connect-MgGraph with a new token, preventing issues where MSAL + # silently returns a cached expired token or a token from a different user account. + Write-LogHost "Clearing any previous Graph session..." -ForegroundColor Gray + try { Disconnect-MgGraph -ErrorAction SilentlyContinue | Out-Null } catch { } + + Write-LogHost "Connecting to Microsoft Graph Security API..." -ForegroundColor Cyan + + # Define required scopes for Purview audit log access via beta endpoint + # ThreatIntelligence.Read.All is required for GET operations on beta endpoint + # Service-specific AuditLogsQuery-*.Read.All permissions are required for record retrieval + $RequiredScopes = @( + 'AuditLog.Read.All' # Primary scope for audit log queries + 'ThreatIntelligence.Read.All' # Required for GET operations (beta API) + 'AuditLogsQuery-Entra.Read.All' # Entra ID (Azure AD) audit logs + 'AuditLogsQuery-Exchange.Read.All' # Exchange Online audit logs + 'AuditLogsQuery-OneDrive.Read.All' # OneDrive audit logs + 'AuditLogsQuery-SharePoint.Read.All' # SharePoint Online audit logs + 'Organization.Read.All' # Required for tenant-level metadata (subscribedSkus, license fetch) + ) + + try { + switch ($AuthMethod.ToLower()) { + 'weblogin' { + Write-LogHost "Using interactive browser authentication..." -ForegroundColor Gray + Connect-MgGraph -Scopes $RequiredScopes -NoWelcome -ErrorAction Stop + } + + 'devicecode' { + Write-LogHost "Using device code flow..." -ForegroundColor Gray + Write-LogHost "A browser window will open. Follow the instructions to authenticate." -ForegroundColor Yellow + Connect-MgGraph -Scopes $RequiredScopes -UseDeviceCode -NoWelcome -ErrorAction Stop + } + + 'credential' { + Write-LogHost "Using client secret credential..." 
-ForegroundColor Gray + + # Check for required environment variables + $tenantId = $env:GRAPH_TENANT_ID + $clientId = $env:GRAPH_CLIENT_ID + $clientSecret = $env:GRAPH_CLIENT_SECRET + + if (-not $tenantId -or -not $clientId -or -not $clientSecret) { + Write-LogHost "ERROR: Credential authentication requires environment variables:" -ForegroundColor Red + Write-LogHost " GRAPH_TENANT_ID : Your Azure AD Tenant ID" -ForegroundColor Yellow + Write-LogHost " GRAPH_CLIENT_ID : Your App Registration Client ID" -ForegroundColor Yellow + Write-LogHost " GRAPH_CLIENT_SECRET : Your App Registration Client Secret" -ForegroundColor Yellow + Write-LogHost "" + Write-LogHost "Set these variables before running the script:" -ForegroundColor Yellow + Write-LogHost " `$env:GRAPH_TENANT_ID = 'your-tenant-id'" -ForegroundColor White + Write-LogHost " `$env:GRAPH_CLIENT_ID = 'your-client-id'" -ForegroundColor White + Write-LogHost " `$env:GRAPH_CLIENT_SECRET = 'your-client-secret'" -ForegroundColor White + throw "Missing required environment variables for credential authentication" + } + + $secureSecret = ConvertTo-SecureString -String $clientSecret -AsPlainText -Force + $credential = New-Object System.Management.Automation.PSCredential($clientId, $secureSecret) + + # Clear plain-text secret from memory + Clear-Variable -Name clientSecret -Force -ErrorAction SilentlyContinue + + Connect-MgGraph -TenantId $tenantId -ClientSecretCredential $credential -NoWelcome -ErrorAction Stop + } + + 'silent' { + Write-LogHost "Using managed identity or existing token..." -ForegroundColor Gray + Connect-MgGraph -Identity -NoWelcome -ErrorAction Stop + } + 'appregistration' { + Write-LogHost "Using app registration authentication..." 
-ForegroundColor Gray + + $appTenantId = $script:TenantId + if ([string]::IsNullOrWhiteSpace($appTenantId)) { $appTenantId = $env:GRAPH_TENANT_ID } + if ([string]::IsNullOrWhiteSpace($appTenantId)) { + Write-LogHost "ERROR: -TenantId or GRAPH_TENANT_ID is required for AppRegistration auth." -ForegroundColor Red + throw "Missing TenantId for AppRegistration authentication" + } + + $appClientId = $script:ClientId + if ([string]::IsNullOrWhiteSpace($appClientId)) { $appClientId = $env:GRAPH_CLIENT_ID } + if ([string]::IsNullOrWhiteSpace($appClientId)) { + Write-LogHost "ERROR: -ClientId or GRAPH_CLIENT_ID is required for AppRegistration auth." -ForegroundColor Red + throw "Missing ClientId for AppRegistration authentication" + } + + # Store auth config for potential re-authentication during long-running operations + $script:AuthConfig.Method = 'AppRegistration' + $script:AuthConfig.TenantId = $appTenantId + $script:AuthConfig.ClientId = $appClientId + $script:AuthConfig.CertStoreLocation = $script:ClientCertificateStoreLocation + + $secretValue = $script:ClientSecret + if ([string]::IsNullOrWhiteSpace($secretValue)) { $secretValue = $env:GRAPH_CLIENT_SECRET } + + $certThumbprint = $script:ClientCertificateThumbprint + if ([string]::IsNullOrWhiteSpace($certThumbprint)) { $certThumbprint = $env:GRAPH_CLIENT_CERT_THUMBPRINT } + + $certPath = $script:ClientCertificatePath + if ([string]::IsNullOrWhiteSpace($certPath)) { $certPath = $env:GRAPH_CLIENT_CERT_PATH } + + $certPasswordSecure = $script:ClientCertificatePassword + if (-not $certPasswordSecure -and $env:GRAPH_CLIENT_CERT_PASSWORD) { + $certPasswordSecure = ConvertTo-SecureString $env:GRAPH_CLIENT_CERT_PASSWORD -AsPlainText -Force + } + + $certPasswordPlain = $null + if ($certPasswordSecure) { + $certPasswordPlain = [System.Net.NetworkCredential]::new('', $certPasswordSecure).Password + } + + if (-not [string]::IsNullOrWhiteSpace($secretValue)) { + Write-LogHost " -> Authenticating with client secret" 
-ForegroundColor Gray + $secureSecret = ConvertTo-SecureString -String $secretValue -AsPlainText -Force + $credential = New-Object System.Management.Automation.PSCredential($appClientId, $secureSecret) + # Store secret securely for re-authentication (keep a copy before clearing) + $script:AuthConfig.ClientSecret = $secureSecret.Copy() + $script:AuthConfig.CanReauthenticate = $true + Clear-Variable -Name secretValue -Force -ErrorAction SilentlyContinue + Connect-MgGraph -TenantId $appTenantId -ClientSecretCredential $credential -NoWelcome -ErrorAction Stop + Clear-Variable -Name secureSecret -Force -ErrorAction SilentlyContinue + Clear-Variable -Name credential -Force -ErrorAction SilentlyContinue + } + elseif (-not [string]::IsNullOrWhiteSpace($certThumbprint)) { + Write-LogHost " -> Authenticating with certificate thumbprint $certThumbprint" -ForegroundColor Gray + $storeLocation = [System.Security.Cryptography.X509Certificates.StoreLocation]::$script:ClientCertificateStoreLocation + $store = New-Object System.Security.Cryptography.X509Certificates.X509Store("My", $storeLocation) + $store.Open([System.Security.Cryptography.X509Certificates.OpenFlags]::ReadOnly) + try { + $certificate = $store.Certificates | Where-Object { $_.Thumbprint -eq $certThumbprint } + if (-not $certificate) { + Write-LogHost "ERROR: Certificate with thumbprint '$certThumbprint' not found in $script:ClientCertificateStoreLocation store." 
-ForegroundColor Red + throw "Certificate not found" + } + # Store cert thumbprint for re-authentication + $script:AuthConfig.CertThumbprint = $certThumbprint + $script:AuthConfig.CanReauthenticate = $true + Connect-MgGraph -TenantId $appTenantId -ClientId $appClientId -CertificateThumbprint $certThumbprint -NoWelcome -ErrorAction Stop + } + finally { + $store.Close() + } + } + elseif (-not [string]::IsNullOrWhiteSpace($certPath)) { + Write-LogHost " -> Authenticating with certificate file $certPath" -ForegroundColor Gray + $flags = [System.Security.Cryptography.X509Certificates.X509KeyStorageFlags]::Exportable + $cert = $null + try { + if ($certPasswordPlain) { + $cert = New-Object System.Security.Cryptography.X509Certificates.X509Certificate2($certPath, $certPasswordPlain, $flags) + } + else { + $cert = New-Object System.Security.Cryptography.X509Certificates.X509Certificate2($certPath) + } + # Store cert path and password for re-authentication + $script:AuthConfig.CertPath = $certPath + if ($certPasswordSecure) { $script:AuthConfig.CertPassword = $certPasswordSecure.Copy() } + $script:AuthConfig.CanReauthenticate = $true + Connect-MgGraph -TenantId $appTenantId -ClientId $appClientId -Certificate $cert -NoWelcome -ErrorAction Stop + } + finally { + if ($cert) { $cert.Dispose() } + if ($certPasswordPlain) { + Clear-Variable -Name certPasswordPlain -Force -ErrorAction SilentlyContinue + } + } + } + else { + Write-LogHost "ERROR: Provide either -ClientSecret, -ClientCertificateThumbprint, or -ClientCertificatePath for AppRegistration auth." 
-ForegroundColor Red + throw "No credential material supplied for AppRegistration" + } + + if ($certPasswordSecure) { + Clear-Variable -Name certPasswordSecure -Force -ErrorAction SilentlyContinue + } + } + } + + Write-LogHost "Successfully connected to Microsoft Graph" -ForegroundColor Green + + # Record token issue time for proactive refresh tracking + $script:AuthConfig.TokenIssueTime = Get-Date + + # Initialize shared auth state for thread jobs (enables proactive token refresh) + $tokenInfo = Get-GraphAccessTokenWithExpiry + if ($tokenInfo) { + $script:SharedAuthState.Token = $tokenInfo.Token + $script:SharedAuthState.ExpiresOn = $tokenInfo.ExpiresOn + $script:SharedAuthState.LastRefresh = Get-Date + $script:SharedAuthState.AuthMethod = $AuthMethod.ToLower() + Write-LogHost " Token expires: $($tokenInfo.ExpiresOn.ToString('HH:mm:ss')) UTC (source: $($tokenInfo.Source))" -ForegroundColor Gray + } + + # Get and display current context + $context = Get-MgContext + Write-LogHost " Tenant ID: $($context.TenantId)" -ForegroundColor Gray + $maskedAccount = Get-MaskedUsername -Username $context.Account + Write-LogHost " Account: $maskedAccount" -ForegroundColor Gray + Write-LogHost " Scopes: $($context.Scopes -join ', ')" -ForegroundColor Gray + + # Trigger Graph API version detection early (before queries start) + $null = Get-GraphAuditApiUri -Path 'queries' + + # Validate required scopes are present + $missingScopes = @() + foreach ($scope in $RequiredScopes) { + if ($context.Scopes -notcontains $scope) { + $missingScopes += $scope + } + } + + if ($missingScopes.Count -gt 0) { + Write-LogHost "" + Write-LogHost "WARNING: Missing required scope(s):" -ForegroundColor Yellow + foreach ($scope in $missingScopes) { + Write-LogHost " • $scope" -ForegroundColor Yellow + } + Write-LogHost "" + Write-LogHost "Script may fail when accessing audit logs." -ForegroundColor Yellow + Write-LogHost "Consider re-authenticating with full permissions." 
-ForegroundColor Yellow + Write-LogHost "" + } + + $script:Connected = $true + } + catch { + Write-LogHost "ERROR: Graph API authentication failed: $($_.Exception.Message)" -ForegroundColor Red + Write-LogHost "" + Write-LogHost "Troubleshooting:" -ForegroundColor Yellow + Write-LogHost " 1. Ensure you have AuditLog.Read.All permission" -ForegroundColor White + Write-LogHost " 2. Verify Azure AD role (Compliance/Security Administrator)" -ForegroundColor White + Write-LogHost " 3. Check network connectivity to Microsoft Graph API" -ForegroundColor White + Write-LogHost " 4. Try a different authentication method (-Auth parameter)" -ForegroundColor White + Write-LogHost " 5. Use -UseEOM switch to fall back to EOM mode" -ForegroundColor White + Write-LogHost "" + throw + } + } +} + +# ============================================== +# ACCESS TOKEN EXTRACTION HELPER +# ============================================== +function Get-GraphAccessToken { + <# + .SYNOPSIS + Extracts the current access token from an active Microsoft Graph session. + + .DESCRIPTION + Microsoft Graph PowerShell SDK 2.x does NOT expose AccessToken via Get-MgContext + for security reasons. This function reliably extracts the token by making a + lightweight request and extracting the Authorization header. 
function Get-GraphAccessToken {
    <#
    .SYNOPSIS
        Extracts the current access token from an active Microsoft Graph session.

    .DESCRIPTION
        Microsoft Graph PowerShell SDK 2.x does NOT expose AccessToken via Get-MgContext
        for security reasons. This function extracts the token by issuing a Graph request
        and reading the Authorization header the SDK attached to the outgoing request.

        Primary method: Authorization header of the request behind an HttpResponseMessage
        Fallback method: Get-MgContext.AccessToken (for older SDK versions)

        Both methods are best-effort: any error is swallowed and the next method is tried.

    .OUTPUTS
        [string] The access token, or $null if extraction fails
    #>
    [CmdletBinding()]
    param()

    # Primary method: read the bearer token from the request that produced the response
    $response = $null
    try {
        $response = Invoke-MgGraphRequest -Method GET -Uri 'https://graph.microsoft.com/v1.0/$metadata' -OutputType HttpResponseMessage -ErrorAction Stop
        $token = $response.RequestMessage.Headers.Authorization.Parameter
        if ($token) {
            return $token
        }
    }
    catch {
        # HTTP method failed, try fallback
    }
    finally {
        # HttpResponseMessage is IDisposable; release it so repeated token lookups during
        # long runs do not accumulate undisposed responses. The token string is already copied.
        if ($response -is [System.IDisposable]) {
            $response.Dispose()
        }
    }

    # Fallback method: Get-MgContext.AccessToken (works in older SDK versions)
    try {
        $context = Get-MgContext -ErrorAction SilentlyContinue
        if ($context -and $context.AccessToken) {
            return $context.AccessToken
        }
    }
    catch {
        # Fallback also failed
    }

    return $null
}
function Get-GraphAccessTokenWithExpiry {
    <#
    .SYNOPSIS
        Returns the current Graph access token together with its expiry time.

    .DESCRIPTION
        Obtains the token via Get-GraphAccessToken, then reads the 'exp' claim from the
        JWT payload (second dot-separated segment, base64url-encoded JSON, Unix seconds).
        When the payload cannot be decoded or carries no 'exp' claim, the expiry is
        estimated as 50 minutes from now.

    .OUTPUTS
        [hashtable] with keys Token (string), ExpiresOn (UTC DateTime) and
        Source ('JWT' or 'estimated'). Returns $null if no token can be extracted.
    #>
    [CmdletBinding()]
    param()

    $accessToken = Get-GraphAccessToken
    if (-not $accessToken) {
        return $null
    }

    try {
        $segments = $accessToken.Split('.')
        if ($segments.Count -ge 2) {
            # base64url -> base64: swap the URL-safe alphabet, then restore '=' padding
            $encodedClaims = $segments[1].Replace('-', '+').Replace('_', '/')
            $remainder = $encodedClaims.Length % 4
            if ($remainder -ne 0) {
                $encodedClaims += ('=' * (4 - $remainder))
            }

            $claimsJson = [System.Text.Encoding]::UTF8.GetString([Convert]::FromBase64String($encodedClaims))
            $claims = $claimsJson | ConvertFrom-Json

            if ($claims.exp) {
                # 'exp' counts seconds since 1970-01-01T00:00:00Z
                $epoch = [DateTime]::new(1970, 1, 1, 0, 0, 0, [DateTimeKind]::Utc)
                $expiry = $epoch.AddSeconds($claims.exp)

                $remaining = $expiry - (Get-Date).ToUniversalTime()
                if ($remaining.TotalMinutes -gt 0) {
                    Write-Verbose "Token expires in $([int]$remaining.TotalMinutes) minutes (from JWT 'exp' claim)"
                }

                return @{
                    Token     = $accessToken
                    ExpiresOn = $expiry
                    Source    = 'JWT'
                }
            }
        }
    }
    catch {
        # Decoding is best-effort; fall through to the estimate below
        Write-Verbose "JWT decode failed: $($_.Exception.Message)"
    }

    # Estimate: 50 minutes from now (observed token lifetime ~45-60 minutes)
    return @{
        Token     = $accessToken
        ExpiresOn = (Get-Date).ToUniversalTime().AddMinutes(50)
        Source    = 'estimated'
    }
}
function Invoke-TokenRefresh {
    <#
    .SYNOPSIS
        Forces re-authentication for AppRegistration auth mode to get fresh access token.

    .DESCRIPTION
        When using App Registration authentication (client secret or certificate),
        this function disconnects and reconnects to Microsoft Graph using the credential
        material stored in $script:AuthConfig, then extracts the new access token.
        This is critical for long-running operations that exceed the OAuth token lifetime.

        For auth modes that cannot re-authenticate unattended, the function returns a
        result with Success = $false and an explanatory Message; it does not throw.

        On success it also resets $script:AuthConfig.TokenIssueTime,
        $script:TokenAcquiredTime, $script:AuthFailureDetected and
        $script:Auth401MessageShown.

    .PARAMETER Force
        Accepted for caller compatibility. The function always performs a full
        disconnect/reconnect when it is able to, regardless of this switch.

    .OUTPUTS
        [PSCustomObject] with Success ($true/$false), NewToken, Message and AuthMethod
    #>
    [CmdletBinding()]
    param(
        [switch]$Force
    )

    $result = [PSCustomObject]@{
        Success    = $false
        NewToken   = $null
        Message    = ""
        AuthMethod = $script:AuthConfig.Method
    }

    # Check if we can re-authenticate
    if (-not $script:AuthConfig.CanReauthenticate) {
        $result.Message = "Auth method '$($script:AuthConfig.Method)' does not support automatic re-authentication"
        return $result
    }

    # Validate we have stored config
    if ($script:AuthConfig.Method -ne 'AppRegistration') {
        $result.Message = "Only AppRegistration auth mode supports automatic token refresh"
        return $result
    }

    if ([string]::IsNullOrWhiteSpace($script:AuthConfig.TenantId) -or
        [string]::IsNullOrWhiteSpace($script:AuthConfig.ClientId)) {
        $result.Message = "Missing TenantId or ClientId in stored auth config"
        return $result
    }

    Write-LogHost " [TOKEN-REFRESH] Attempting re-authentication using AppRegistration..." -ForegroundColor Cyan

    try {
        # Disconnect first to ensure clean state
        try {
            Disconnect-MgGraph -ErrorAction SilentlyContinue | Out-Null
        } catch { }

        # Re-authenticate based on stored credential type
        $reconnected = $false

        # Try client secret first
        if ($script:AuthConfig.ClientSecret) {
            Write-LogHost " [TOKEN-REFRESH] Reconnecting with client secret..." -ForegroundColor Gray
            $credential = New-Object System.Management.Automation.PSCredential(
                $script:AuthConfig.ClientId,
                $script:AuthConfig.ClientSecret
            )
            Connect-MgGraph -TenantId $script:AuthConfig.TenantId -ClientSecretCredential $credential -NoWelcome -ErrorAction Stop
            $reconnected = $true
        }
        # Try certificate thumbprint
        elseif ($script:AuthConfig.CertThumbprint) {
            Write-LogHost " [TOKEN-REFRESH] Reconnecting with certificate thumbprint..." -ForegroundColor Gray
            Connect-MgGraph -TenantId $script:AuthConfig.TenantId -ClientId $script:AuthConfig.ClientId -CertificateThumbprint $script:AuthConfig.CertThumbprint -NoWelcome -ErrorAction Stop
            $reconnected = $true
        }
        # Try certificate file
        elseif ($script:AuthConfig.CertPath) {
            Write-LogHost " [TOKEN-REFRESH] Reconnecting with certificate file..." -ForegroundColor Gray
            $flags = [System.Security.Cryptography.X509Certificates.X509KeyStorageFlags]::Exportable
            $cert = $null
            $plainPassword = $null
            try {
                if ($script:AuthConfig.CertPassword) {
                    # NetworkCredential converts the SecureString without leaving an unmanaged
                    # BSTR to free, and behaves the same on Windows and non-Windows hosts
                    # (same technique Connect-PurviewAudit uses for the initial sign-in).
                    $plainPassword = [System.Net.NetworkCredential]::new('', $script:AuthConfig.CertPassword).Password
                    $cert = New-Object System.Security.Cryptography.X509Certificates.X509Certificate2(
                        $script:AuthConfig.CertPath, $plainPassword, $flags
                    )
                }
                else {
                    $cert = New-Object System.Security.Cryptography.X509Certificates.X509Certificate2(
                        $script:AuthConfig.CertPath
                    )
                }
                Connect-MgGraph -TenantId $script:AuthConfig.TenantId -ClientId $script:AuthConfig.ClientId -Certificate $cert -NoWelcome -ErrorAction Stop
                $reconnected = $true
            }
            finally {
                # Runs on failure too, so the plaintext password never outlives this block
                Clear-Variable -Name plainPassword -Force -ErrorAction SilentlyContinue
                if ($cert) { $cert.Dispose() }
            }
        }

        if ($reconnected) {
            # Use reliable token extraction helper (HTTP method primary)
            $result.NewToken = Get-GraphAccessToken
            if ($result.NewToken) {
                $result.Success = $true
                $result.Message = "Successfully refreshed token"

                # Update token timing and reset auth failure flags
                $script:AuthConfig.TokenIssueTime = Get-Date
                $script:TokenAcquiredTime = Get-Date
                $script:AuthFailureDetected = $false
                $script:Auth401MessageShown = $false  # Reset for next auth failure cycle

                Write-LogHost " [TOKEN-REFRESH] Successfully obtained fresh access token" -ForegroundColor Green
                Write-LogHost " [TOKEN-REFRESH] Token acquired at $(Get-Date -Format 'HH:mm:ss') - proactive refresh at 30-minute age" -ForegroundColor DarkGray
            }
            else {
                $result.Message = "Reconnected but could not extract access token"
                Write-LogHost " [TOKEN-REFRESH] ✗ $($result.Message)" -ForegroundColor Red
            }
        }
        else {
            $result.Message = "No valid credential found in stored auth config"
        }
    }
    catch {
        $result.Message = "Re-authentication failed: $($_.Exception.Message)"
        Write-LogHost " [TOKEN-REFRESH] ✗ $($result.Message)" -ForegroundColor Red
    }

    return $result
}
function Refresh-GraphTokenIfNeeded {
    <#
    .SYNOPSIS
        Proactively refreshes the Graph access token if it's nearing expiry.

    .DESCRIPTION
        Checks SharedAuthState.ExpiresOn and refreshes the token when no more than
        -BufferMinutes remain before expiry. For AppRegistration auth that can
        re-authenticate, a refresh is also triggered once the token is older than
        30 minutes.

        Refresh order:
          1. Re-read the session token (Get-GraphAccessTokenWithExpiry). It is accepted only
             if it differs from the stored token and expires more than 2 minutes from now.
          2. Invoke-TokenRefresh -Force, when AuthConfig.CanReauthenticate is set.
          3. If both fail: return 'Quit' for AppRegistration when a -Force value is
             visible from the calling scope (headless run); otherwise prompt through
             Invoke-TokenRefreshPrompt.

        Cooldown between attempts depends on the auth mode: 45 seconds for
        AppRegistration, 5 minutes for all other modes.

    .PARAMETER BufferMinutes
        Refresh if token expires within this many minutes. Default: 5.

    .OUTPUTS
        $true   - Token was refreshed successfully (silent or interactive)
        $false  - No refresh needed (token still valid) or within cooldown period
        'Quit'  - User chose to quit at the re-auth prompt, or headless refresh failed

        CRITICAL: Callers MUST check for 'Quit' return and handle gracefully!
    #>
    [CmdletBinding()]
    param(
        [int]$BufferMinutes = 5
    )

    # Check if we have shared auth state
    if (-not $script:SharedAuthState.ExpiresOn) {
        return $false
    }

    $now = (Get-Date).ToUniversalTime()
    $expiresOn = $script:SharedAuthState.ExpiresOn
    $minutesRemaining = ($expiresOn - $now).TotalMinutes

    # PROACTIVE REFRESH FOR APPREG: Refresh at 30-minute token age (not just near expiry)
    # AppRegistration can refresh silently, so we do this proactively to avoid 401s
    $needsProactiveRefresh = $false
    if ($script:AuthConfig.Method -eq 'AppRegistration' -and $script:AuthConfig.CanReauthenticate) {
        if ($script:AuthConfig.TokenIssueTime) {
            $tokenAge = (Get-Date) - $script:AuthConfig.TokenIssueTime
            if ($tokenAge.TotalMinutes -gt 30) {
                $needsProactiveRefresh = $true
                Write-LogHost " [TOKEN] Token age: $([Math]::Round($tokenAge.TotalMinutes, 1)) minutes - proactive refresh triggered" -ForegroundColor Yellow
            }
        }
    }

    if ($minutesRemaining -gt $BufferMinutes -and -not $needsProactiveRefresh) {
        return $false  # Token still valid, no refresh needed
    }

    # COOLDOWN CHECK: Auth-mode-aware cooldown between refresh attempts
    # AppReg: 45 seconds (silent client_credentials grant - cheap and fast)
    # Interactive: 5 minutes (avoids spamming browser/prompt windows)
    $cooldownMinutes = if ($script:AuthConfig.Method -eq 'AppRegistration') { 0.75 } else { 5 }
    if ($script:LastProactiveRefreshAttempt) {
        $timeSinceLastAttempt = ((Get-Date) - $script:LastProactiveRefreshAttempt).TotalMinutes
        if ($timeSinceLastAttempt -lt $cooldownMinutes) {
            return $false
        }
    }
    $script:LastProactiveRefreshAttempt = Get-Date

    # Log appropriate message based on trigger reason
    if (-not $needsProactiveRefresh) {
        Write-LogHost " [TOKEN] Token expires in $([Math]::Round($minutesRemaining, 1)) minutes - attempting proactive refresh..." -ForegroundColor Yellow
    }

    # Step 1: re-read the token from the active Graph session
    $tokenInfo = Get-GraphAccessTokenWithExpiry
    if ($tokenInfo -and $tokenInfo.Token -ne $script:SharedAuthState.Token) {
        # Validate the new token is actually valid (ExpiresOn must be > 2 min in the future)
        # Protects against a stale cache returning already-expired tokens (e.g., after process suspension)
        $tokenExpiresOn = $tokenInfo.ExpiresOn
        $nowUtc = (Get-Date).ToUniversalTime()
        $minutesUntilExpiry = ($tokenExpiresOn - $nowUtc).TotalMinutes
        if ($minutesUntilExpiry -le 2) {
            Write-LogHost " [TOKEN] WARNING: Refreshed token is already expired or near-expiry (expires in $([Math]::Round($minutesUntilExpiry, 1)) min) - forcing full re-authentication" -ForegroundColor Red
            # Fall through to Invoke-TokenRefresh -Force below
        } else {
            # Got a genuinely valid new token
            $script:SharedAuthState.Token = $tokenInfo.Token
            $script:SharedAuthState.ExpiresOn = $tokenInfo.ExpiresOn
            $script:SharedAuthState.LastRefresh = Get-Date
            $script:SharedAuthState.RefreshCount++

            Write-LogHost " [TOKEN] Token refreshed silently (expires: $($tokenInfo.ExpiresOn.ToString('HH:mm:ss')) UTC, refresh #$($script:SharedAuthState.RefreshCount))" -ForegroundColor Green
            Write-LogHost " [TOKEN] Note: In-flight queries may still require re-auth before this expiration" -ForegroundColor DarkGray
            return $true
        }
    }

    # Step 2: session token unchanged (or stale) - try AppRegistration refresh if available
    if ($script:AuthConfig.CanReauthenticate) {
        $refreshResult = Invoke-TokenRefresh -Force
        if ($refreshResult.Success) {
            $script:SharedAuthState.Token = $refreshResult.NewToken
            # Expiry is estimated here (50 minutes), not decoded from the new token
            $script:SharedAuthState.ExpiresOn = (Get-Date).ToUniversalTime().AddMinutes(50)
            $script:SharedAuthState.LastRefresh = Get-Date
            $script:SharedAuthState.RefreshCount++
            $script:AuthConfig.TokenIssueTime = Get-Date  # Reset age timer for proactive refresh

            Write-LogHost " [TOKEN] Token refreshed via AppRegistration (refresh #$($script:SharedAuthState.RefreshCount))" -ForegroundColor Green
            return $true
        }
    }

    # SILENT REFRESH FAILED
    # For AppRegistration + Force: FATAL exit (true headless operation)
    # For AppRegistration without Force: Fall back to interactive prompt
    # For interactive modes: Prompt user for re-authentication
    Write-LogHost " [TOKEN] [!] Silent token refresh failed - interactive re-authentication required" -ForegroundColor Red

    # This function declares no -Force parameter; the value comes from whichever calling
    # scope defines $Force (presumably the script-level -Force switch - confirm against the
    # script's param block). Get-Variable performs the same scope walk as a bare $Force
    # reference but yields $false instead of failing when the variable is absent
    # (e.g. under Set-StrictMode).
    $forceMode = [bool](Get-Variable -Name Force -ValueOnly -ErrorAction SilentlyContinue)

    # AppRegistration mode with -Force: Silent refresh failure is fatal (no interactive fallback for headless runs)
    if ($script:AuthConfig.Method -eq 'AppRegistration' -and $forceMode) {
        Write-LogHost " [TOKEN] FATAL: AppRegistration token refresh failed. Cannot continue headless (-Force mode)." -ForegroundColor Red
        Write-LogHost " [TOKEN] Check: client secret expiration, certificate validity, or API permissions." -ForegroundColor Yellow
        return 'Quit'
    }

    # Interactive modes OR AppRegistration without -Force: prompt user for re-authentication
    $refreshResult = Invoke-TokenRefreshPrompt
    if ($refreshResult -eq 'Quit') {
        # User chose to quit - return special value for callers to handle
        return 'Quit'
    }

    # Any non-'Quit' result from the prompt is treated as a successful re-authentication
    return $true
}
function Initialize-CheckpointForNewRun {
    <#
    .SYNOPSIS
        Builds the checkpoint structure for a fresh (non-resume) run and persists it.
    .DESCRIPTION
        Derives the _PARTIAL output file path and the hidden checkpoint file path,
        stores both in script scope ($script:PartialOutputPath, $script:CheckpointPath),
        fills $script:CheckpointData with a snapshot of the run parameters, and writes
        the initial checkpoint through Save-CheckpointToDisk.

        Settings missing (or falsy) in -AllParameters fall back to the defaults
        written below. No secrets are stored in the checkpoint.
    .OUTPUTS
        [string] Full path of the _PARTIAL output file.
    #>
    param(
        [Parameter(Mandatory)]
        [string]$OutputPath,

        [Parameter(Mandatory)]
        [string]$BaseOutputFileName,

        [Parameter(Mandatory)]
        [string]$RunTimestamp,

        [Parameter(Mandatory)]
        [datetime]$StartDate,

        [Parameter(Mandatory)]
        [datetime]$EndDate,

        [Parameter()]
        [hashtable]$AllParameters
    )

    # <name>_PARTIAL<ext> lives next to the final output file
    $baseName = [System.IO.Path]::GetFileNameWithoutExtension($BaseOutputFileName)
    $extension = [System.IO.Path]::GetExtension($BaseOutputFileName)
    $partialFileName = "${baseName}_PARTIAL${extension}"
    $script:PartialOutputPath = Join-Path $OutputPath $partialFileName

    # Checkpoint file is dot-prefixed and keyed by the run timestamp
    $script:CheckpointPath = Join-Path $OutputPath ".pax_checkpoint_${RunTimestamp}.json"

    # Shared by the parameter snapshot and the partition tracker
    $blockHoursValue = if ($AllParameters.BlockHours) { $AllParameters.BlockHours } else { 0.5 }

    # ---- parameter snapshot --------------------------------------------------
    $parameterSnapshot = @{
        # Date range
        startDate = $StartDate.ToUniversalTime().ToString('o')
        endDate   = $EndDate.ToUniversalTime().ToString('o')

        # Agent / bundle / output switches (always stored as booleans)
        agentsOnly                = [bool]$AllParameters.AgentsOnly
        excludeAgents             = [bool]$AllParameters.ExcludeAgents
        explodeArrays             = [bool]$AllParameters.ExplodeArrays
        explodeDeep               = [bool]$AllParameters.ExplodeDeep
        includeM365Usage          = [bool]$AllParameters.IncludeM365Usage
        includeUserInfo           = [bool]$AllParameters.IncludeUserInfo
        includeDSPMForAI          = [bool]$AllParameters.IncludeDSPMForAI
        includeCopilotInteraction = [bool]$AllParameters.IncludeCopilotInteraction
        excludeCopilotInteraction = [bool]$AllParameters.ExcludeCopilotInteraction
        exportWorkbook            = [bool]$AllParameters.ExportWorkbook
        combineOutput             = [bool]$AllParameters.CombineOutput
        useEOM                    = [bool]$AllParameters.UseEOM
        autoCompleteness          = [bool]$AllParameters.AutoCompleteness
        includeTelemetry          = [bool]$AllParameters.IncludeTelemetry

        # Output settings
        outputPath = $OutputPath

        # Partitioning
        blockHours = $blockHoursValue
    }

    # Activity/Record/Agent filters: always stored as arrays
    if ($AllParameters.ActivityTypes) { $parameterSnapshot.activityTypes = @($AllParameters.ActivityTypes) } else { $parameterSnapshot.activityTypes = @() }
    if ($AllParameters.RecordTypes) { $parameterSnapshot.recordTypes = @($AllParameters.RecordTypes) } else { $parameterSnapshot.recordTypes = @() }
    if ($AllParameters.ServiceTypes) { $parameterSnapshot.serviceTypes = @($AllParameters.ServiceTypes) } else { $parameterSnapshot.serviceTypes = @() }
    if ($AllParameters.UserIds) { $parameterSnapshot.userIds = @($AllParameters.UserIds) } else { $parameterSnapshot.userIds = @() }
    if ($AllParameters.GroupNames) { $parameterSnapshot.groupNames = @($AllParameters.GroupNames) } else { $parameterSnapshot.groupNames = @() }
    if ($AllParameters.AgentId) { $parameterSnapshot.agentId = @($AllParameters.AgentId) } else { $parameterSnapshot.agentId = @() }

    # Prompt filtering
    if ($AllParameters.PromptFilter) { $parameterSnapshot.promptFilter = $AllParameters.PromptFilter } else { $parameterSnapshot.promptFilter = $null }

    # Schema/Explosion tuning
    if ($AllParameters.ExplosionThreads) { $parameterSnapshot.explosionThreads = $AllParameters.ExplosionThreads } else { $parameterSnapshot.explosionThreads = 0 }
    if ($AllParameters.FlatDepth) { $parameterSnapshot.flatDepth = $AllParameters.FlatDepth } else { $parameterSnapshot.flatDepth = 120 }
    if ($AllParameters.StreamingSchemaSample) { $parameterSnapshot.streamingSchemaSample = $AllParameters.StreamingSchemaSample } else { $parameterSnapshot.streamingSchemaSample = 5000 }
    if ($AllParameters.StreamingChunkSize) { $parameterSnapshot.streamingChunkSize = $AllParameters.StreamingChunkSize } else { $parameterSnapshot.streamingChunkSize = 5000 }

    # Partitioning
    if ($AllParameters.PartitionHours) { $parameterSnapshot.partitionHours = $AllParameters.PartitionHours } else { $parameterSnapshot.partitionHours = 0 }
    if ($AllParameters.MaxPartitions) { $parameterSnapshot.maxPartitions = $AllParameters.MaxPartitions } else { $parameterSnapshot.maxPartitions = 160 }

    # Auth (method and identifiers only - ClientSecret is deliberately NOT stored)
    if ($AllParameters.Auth) { $parameterSnapshot.auth = $AllParameters.Auth } else { $parameterSnapshot.auth = 'WebLogin' }
    if ($AllParameters.TenantId) { $parameterSnapshot.tenantId = $AllParameters.TenantId } else { $parameterSnapshot.tenantId = $null }
    if ($AllParameters.ClientId) { $parameterSnapshot.clientId = $AllParameters.ClientId } else { $parameterSnapshot.clientId = $null }

    # Other settings
    if ($AllParameters.ResultSize) { $parameterSnapshot.resultSize = $AllParameters.ResultSize } else { $parameterSnapshot.resultSize = 10000 }
    if ($AllParameters.MaxConcurrency) { $parameterSnapshot.maxConcurrency = $AllParameters.MaxConcurrency } else { $parameterSnapshot.maxConcurrency = 10 }
    if ($AllParameters.MaxMemoryMB) { $parameterSnapshot.maxMemoryMB = $AllParameters.MaxMemoryMB } else { $parameterSnapshot.maxMemoryMB = 0 }

    # ---- progress trackers (all start empty) ---------------------------------
    $partitionTracker = @{
        total        = 0
        blockHours   = $blockHoursValue
        completed    = @()
        queryCreated = @()
    }

    $statisticsTracker = @{
        totalRecordsSaved      = 0
        partitionsComplete     = 0
        partitionsQueryCreated = 0
        partitionsRemaining    = 0
    }

    $explosionTracker = @{
        status           = 'NotStarted'  # NotStarted, InProgress, Completed
        recordsProcessed = 0
        rowsGenerated    = 0
        lastUpdateTime   = $null
    }

    # ---- assemble ------------------------------------------------------------
    $script:CheckpointData = @{
        version      = 2  # Bumped version for expanded parameter storage
        runTimestamp = $RunTimestamp
        created      = (Get-Date).ToUniversalTime().ToString('o')
        lastUpdated  = (Get-Date).ToUniversalTime().ToString('o')
        parameters   = $parameterSnapshot
        outputFiles  = @{
            partialCsv = $partialFileName
            finalCsv   = $BaseOutputFileName
        }
        partitions   = $partitionTracker
        statistics   = $statisticsTracker
        explosion    = $explosionTracker
    }

    # Persist the initial checkpoint before any work starts
    Save-CheckpointToDisk

    return $script:PartialOutputPath
}
+ #> + + if (-not $script:CheckpointPath -or -not $script:CheckpointData) { + return + } + + try { + # Update timestamp + $script:CheckpointData.lastUpdated = (Get-Date).ToUniversalTime().ToString('o') + + # Update statistics + $script:CheckpointData.statistics.partitionsComplete = $script:CheckpointData.partitions.completed.Count + $script:CheckpointData.statistics.partitionsQueryCreated = $script:CheckpointData.partitions.queryCreated.Count + $script:CheckpointData.statistics.partitionsRemaining = $script:CheckpointData.partitions.total - + $script:CheckpointData.statistics.partitionsComplete - + $script:CheckpointData.statistics.partitionsQueryCreated + + # Write to temp file first (atomic write pattern) + $tempPath = "$($script:CheckpointPath).tmp" + $script:CheckpointData | ConvertTo-Json -Depth 10 | Set-Content -Path $tempPath -Encoding UTF8 -Force + + # Remove destination first if it exists (Move-Item -Force doesn't always work on Windows) + if (Test-Path $script:CheckpointPath) { + Remove-Item -Path $script:CheckpointPath -Force -ErrorAction SilentlyContinue + } + + # Rename to final (atomic on most filesystems) + Move-Item -Path $tempPath -Destination $script:CheckpointPath -Force + } + catch { + Write-LogHost " Warning: Failed to save checkpoint: $($_.Exception.Message)" -ForegroundColor Yellow + } +} + +function Save-Checkpoint { + <# + .SYNOPSIS + Updates checkpoint with partition state change and saves to disk. + .PARAMETER PartitionIndex + The partition index (1-based). + .PARAMETER State + 'QueryCreated' or 'Completed' + .PARAMETER QueryId + The server-assigned query ID. + .PARAMETER PartitionStart + Partition start time (optional - looked up from partitionStatus if not provided). + .PARAMETER PartitionEnd + Partition end time (optional - looked up from partitionStatus if not provided). + .PARAMETER RecordCount + Number of records (only for Completed state). 
+ .PARAMETER Force + Just save current checkpoint state to disk without updating partition info. + #> + param( + [Parameter()] + [int]$PartitionIndex, + + [Parameter()] + [ValidateSet('QueryCreated', 'Completed')] + [string]$State, + + [Parameter()] + [string]$QueryId, + + [Parameter()] + [datetime]$PartitionStart, + + [Parameter()] + [datetime]$PartitionEnd, + + [Parameter()] + [int]$RecordCount = 0, + + [Parameter()] + [switch]$Force + ) + + if (-not $script:CheckpointData) { + return + } + + # If -Force is specified, just save current state to disk without updating partition info + if ($Force) { + Save-CheckpointToDisk + return + } + + # For normal calls, require the mandatory parameters + if (-not $PartitionIndex -or -not $State -or -not $QueryId) { + Write-Verbose "Save-Checkpoint: Missing required parameters (PartitionIndex, State, QueryId) - skipping" + return + } + + # Look up partition times from partitionStatus if not provided + if (-not $PartitionStart -or -not $PartitionEnd) { + $partitionInfo = $script:partitionStatus[$PartitionIndex] + if ($partitionInfo -and $partitionInfo.Partition) { + # The Partition object has PStart and PEnd properties + if (-not $PartitionStart -and $partitionInfo.Partition.PStart) { $PartitionStart = $partitionInfo.Partition.PStart } + if (-not $PartitionEnd -and $partitionInfo.Partition.PEnd) { $PartitionEnd = $partitionInfo.Partition.PEnd } + } + + # If still missing, we can't proceed with checkpoint save + if (-not $PartitionStart -or -not $PartitionEnd) { + Write-Verbose "Save-Checkpoint: Could not determine partition times for index $PartitionIndex - skipping checkpoint update" + return + } + } + + $partitionEntry = @{ + index = $PartitionIndex + start = $PartitionStart.ToUniversalTime().ToString('o') + end = $PartitionEnd.ToUniversalTime().ToString('o') + queryId = $QueryId + } + + if ($State -eq 'QueryCreated') { + $partitionEntry.createdAt = (Get-Date).ToUniversalTime().ToString('o') + + # Add to queryCreated list (if 
not already there) + $existing = $script:CheckpointData.partitions.queryCreated | Where-Object { $_.index -eq $PartitionIndex } + if (-not $existing) { + $script:CheckpointData.partitions.queryCreated += $partitionEntry + } + } + elseif ($State -eq 'Completed') { + $partitionEntry.records = $RecordCount + + # Remove from queryCreated if present + $script:CheckpointData.partitions.queryCreated = @( + $script:CheckpointData.partitions.queryCreated | Where-Object { $_.index -ne $PartitionIndex } + ) + + # Add to completed list (if not already there) + $existing = $script:CheckpointData.partitions.completed | Where-Object { $_.index -eq $PartitionIndex } + if (-not $existing) { + $script:CheckpointData.partitions.completed += $partitionEntry + $script:CheckpointData.statistics.totalRecordsSaved += $RecordCount + } + } + + # Save to disk + Save-CheckpointToDisk +} + +function Read-Checkpoint { + <# + .SYNOPSIS + Loads and validates a checkpoint file. + .PARAMETER CheckpointPath + Path to the checkpoint JSON file. + .OUTPUTS + $true if valid and loaded, $false if invalid. + #> + param( + [Parameter(Mandatory)] + [string]$CheckpointPath + ) + + if (-not (Test-Path $CheckpointPath)) { + Write-LogHost "ERROR: Checkpoint file not found: $CheckpointPath" -ForegroundColor Red + return $false + } + + try { + $data = Get-Content -Path $CheckpointPath -Raw | ConvertFrom-Json -AsHashtable + + # Validate version (supports version 1 and 2) + if (-not $data.version -or $data.version -gt 2) { + Write-LogHost "ERROR: Unsupported checkpoint version: $($data.version). This script supports versions 1-2." 
-ForegroundColor Red + return $false + } + + # Validate required fields + if (-not $data.runTimestamp -or -not $data.outputFiles -or -not $data.partitions) { + Write-LogHost "ERROR: Checkpoint file is missing required fields" -ForegroundColor Red + return $false + } + + # Get output directory from checkpoint path + $outputDir = Split-Path $CheckpointPath -Parent + $partialCsvPath = Join-Path $outputDir $data.outputFiles.partialCsv + + # Note: We don't require _PARTIAL.csv to exist - the actual data is in .pax_incremental/*.jsonl files + # The _PARTIAL.csv is only created when merging at completion, or may not exist yet + + # Check for incremental save data if there are completed partitions + $completedPartitions = @($data.partitionStates.PSObject.Properties | Where-Object { $_.Value.state -eq 'Completed' }) + if ($completedPartitions.Count -gt 0) { + $incrementalDir = Join-Path $outputDir ".pax_incremental" + $hasIncrementalData = $false + $incrementalRecordCount = 0 + + if (Test-Path $incrementalDir) { + $jsonlFiles = Get-ChildItem -Path $incrementalDir -Filter "*.jsonl" -ErrorAction SilentlyContinue + if ($jsonlFiles -and $jsonlFiles.Count -gt 0) { + $hasIncrementalData = $true + # Count records in files + foreach ($file in $jsonlFiles) { + $incrementalRecordCount += (Get-Content $file.FullName | Measure-Object -Line).Lines + } + } + } + + $expectedRecords = ($completedPartitions | ForEach-Object { $_.Value.recordCount } | Measure-Object -Sum).Sum + + if (-not $hasIncrementalData) { + Write-LogHost "" + Write-LogHost "════════════════════════════════════════════════════════════════════════════════" -ForegroundColor Red + Write-LogHost " WARNING: INCREMENTAL DATA MISSING" -ForegroundColor Red + Write-LogHost "════════════════════════════════════════════════════════════════════════════════" -ForegroundColor Red + Write-LogHost "" + Write-LogHost " Checkpoint shows $($completedPartitions.Count) completed partition(s) with ~$expectedRecords records," -ForegroundColor 
Yellow + Write-LogHost " but the .pax_incremental folder is missing or empty." -ForegroundColor Yellow + Write-LogHost "" + Write-LogHost " Expected location: $incrementalDir" -ForegroundColor White + Write-LogHost "" + Write-LogHost " If you continue, data from completed partitions will be LOST." -ForegroundColor Red + Write-LogHost " The remaining partitions will be re-queried, but previous data cannot be recovered." -ForegroundColor Red + Write-LogHost "" + Write-LogHost " OPTIONS:" -ForegroundColor Cyan + Write-LogHost " 1. Restore the .pax_incremental folder from backup (if available)" -ForegroundColor White + Write-LogHost " 2. Start a fresh run without -Resume (will re-query all partitions)" -ForegroundColor White + Write-LogHost " 3. Continue anyway and accept data loss from completed partitions" -ForegroundColor White + Write-LogHost "" + Write-LogHost "════════════════════════════════════════════════════════════════════════════════" -ForegroundColor Red + Write-LogHost "" + + $response = Read-Host "Continue with potential data loss? (yes/no)" + if ($response -notmatch '^y(es)?$') { + Write-LogHost "Resume cancelled. Consider starting a fresh run." -ForegroundColor Yellow + return $false + } + Write-LogHost "Continuing with resume despite missing incremental data..." 
-ForegroundColor Yellow + } elseif ($incrementalRecordCount -lt ($expectedRecords * 0.9)) { + # Warn if incremental count is significantly less than expected (allow 10% variance for counting differences) + Write-LogHost "" + Write-LogHost " [WARN] Incremental data may be incomplete:" -ForegroundColor Yellow + Write-LogHost " Checkpoint expects ~$expectedRecords records from completed partitions" -ForegroundColor Yellow + Write-LogHost " Found $incrementalRecordCount records in .pax_incremental" -ForegroundColor Yellow + Write-LogHost "" + } + } + + # Load into script scope + $script:CheckpointPath = $CheckpointPath + $script:CheckpointData = $data + $script:PartialOutputPath = $partialCsvPath + $script:IsResumeMode = $true + + return $true + } + catch { + Write-LogHost "ERROR: Failed to parse checkpoint file: $($_.Exception.Message)" -ForegroundColor Red + return $false + } +} + +function Find-Checkpoints { + <# + .SYNOPSIS + Discovers checkpoint files in the specified output directory. + .PARAMETER OutputPath + Directory to search for checkpoint files. + .OUTPUTS + Array of checkpoint info objects sorted by LastUpdated (newest first). 
+ #> + param( + [Parameter(Mandatory)] + [string]$OutputPath + ) + + if (-not (Test-Path $OutputPath)) { + return @() + } + + $checkpointFiles = Get-ChildItem -Path $OutputPath -Filter ".pax_checkpoint_*.json" -Force -ErrorAction SilentlyContinue + + if (-not $checkpointFiles -or $checkpointFiles.Count -eq 0) { + return @() + } + + $checkpoints = @() + + foreach ($file in $checkpointFiles) { + try { + $data = Get-Content -Path $file.FullName -Raw | ConvertFrom-Json -AsHashtable + + $checkpoints += [PSCustomObject]@{ + Path = $file.FullName + FileName = $file.Name + RunTimestamp = $data.runTimestamp + LastUpdated = script:Parse-DateSafe $data.lastUpdated + StartDate = if ($data.parameters.startDate) { $d = script:Parse-DateSafe $data.parameters.startDate; if ($d) { $d.ToString('yyyy-MM-dd') } else { 'Unknown' } } else { 'Unknown' } + EndDate = if ($data.parameters.endDate) { $d = script:Parse-DateSafe $data.parameters.endDate; if ($d) { $d.ToString('yyyy-MM-dd') } else { 'Unknown' } } else { 'Unknown' } + PartitionsComplete = $data.statistics.partitionsComplete + PartitionsTotal = $data.partitions.total + RecordsSaved = $data.statistics.totalRecordsSaved + } + } + catch { + # Skip invalid checkpoint files + continue + } + } + + # Sort by LastUpdated descending (newest first) + return $checkpoints | Sort-Object -Property LastUpdated -Descending +} + +function Select-Checkpoint { + <# + .SYNOPSIS + Prompts user to select from multiple checkpoint files. + .PARAMETER Checkpoints + Array of checkpoint info objects from Find-Checkpoints. + .OUTPUTS + Selected checkpoint path, or $null if user quits. + #> + param( + [Parameter(Mandatory)] + [array]$Checkpoints + ) + + Write-Host "" + Write-Host "════════════════════════════════════════════════════════════════════════════════" -ForegroundColor Cyan + Write-Host " Multiple checkpoint files found. 
Select one to resume:" -ForegroundColor Cyan + Write-Host "════════════════════════════════════════════════════════════════════════════════" -ForegroundColor Cyan + Write-Host "" + + for ($i = 0; $i -lt $Checkpoints.Count; $i++) { + $cp = $Checkpoints[$i] + $num = $i + 1 + Write-Host " [$num] $($cp.LastUpdated.ToString('yyyy-MM-dd HH:mm')) | $($cp.StartDate) to $($cp.EndDate) | $($cp.PartitionsComplete)/$($cp.PartitionsTotal) partitions | $($cp.RecordsSaved.ToString('N0')) records" -ForegroundColor White + Write-Host " $($cp.FileName)" -ForegroundColor DarkGray + Write-Host "" + } + + Write-Host " [Q] Quit (do not resume)" -ForegroundColor Yellow + Write-Host "" + + Send-PromptNotification + while ($true) { + $choice = Read-Host " Enter selection (1-$($Checkpoints.Count)) or 'Q' to quit" + + if ($choice -eq 'Q' -or $choice -eq 'q') { + return $null + } + + $selection = 0 + if ([int]::TryParse($choice, [ref]$selection)) { + if ($selection -ge 1 -and $selection -le $Checkpoints.Count) { + return $Checkpoints[$selection - 1] + } + } + + Write-Host " Invalid selection. Please enter a number 1-$($Checkpoints.Count) or 'Q' to quit." -ForegroundColor Red + } +} + +function Remove-Checkpoint { + <# + .SYNOPSIS + Deletes checkpoint file after successful completion. + #> + + if ($script:CheckpointPath -and (Test-Path $script:CheckpointPath)) { + try { + Remove-Item -Path $script:CheckpointPath -Force + } + catch { + Write-LogHost " Warning: Could not delete checkpoint file: $($_.Exception.Message)" -ForegroundColor Yellow + } + } + + $script:CheckpointPath = $null + $script:CheckpointData = $null +} + +function Get-PartitionsToProcess { + <# + .SYNOPSIS + Categorizes partitions based on checkpoint state for resume. + .PARAMETER AllPartitions + Array of all partition objects for the date range. + .OUTPUTS + Hashtable with ToSkip, ToFetchOnly, and ToCreateAndFetch arrays. 
+ #> + param( + [Parameter(Mandatory)] + [array]$AllPartitions + ) + + $result = @{ + ToSkip = @() # Already completed - skip entirely + ToFetchOnly = @() # Query exists on server - just fetch records + ToCreateAndFetch = @() # Start fresh - create query then fetch + } + + if (-not $script:CheckpointData) { + # No checkpoint - all partitions need full processing + $result.ToCreateAndFetch = $AllPartitions + return $result + } + + $completedIndices = @{} + $queryCreatedIndices = @{} + + # Build lookup tables from checkpoint (use string keys for reliable comparison) + foreach ($cp in $script:CheckpointData.partitions.completed) { + $completedIndices["$($cp.index)"] = $cp + } + foreach ($qc in $script:CheckpointData.partitions.queryCreated) { + $queryCreatedIndices["$($qc.index)"] = $qc + } + + # Categorize each partition + foreach ($partition in $AllPartitions) { + $idx = "$($partition.Index)" # Convert to string for reliable comparison + + if ($completedIndices.ContainsKey($idx)) { + $result.ToSkip += $partition + } + elseif ($queryCreatedIndices.ContainsKey($idx)) { + # Add QueryId to partition for fetch-only processing + $partition | Add-Member -NotePropertyName 'StoredQueryId' -NotePropertyValue $queryCreatedIndices[$idx].queryId -Force + $result.ToFetchOnly += $partition + } + else { + $result.ToCreateAndFetch += $partition + } + } + + return $result +} + +function Test-ShouldPromptTokenRefresh { + <# + .SYNOPSIS + Checks if token refresh handling is needed. + .DESCRIPTION + Reactive detection: Returns true only when a 401 Unauthorized error + has been detected, indicating the token has actually expired. + No proactive time-based prompts - only triggers on real auth failures. + + For AppRegistration mode, this still returns true on 401 so the + auth handling block can perform automatic silent token refresh. + For interactive modes, this triggers a user prompt. + .OUTPUTS + $true if auth failure detected and refresh handling is needed, $false otherwise. 
+ #> + + # Reactive detection: Return true when 401 error detected (for ALL auth methods) + # The auth handling block will determine whether to auto-refresh (AppRegistration) + # or prompt the user (interactive modes) + return $script:AuthFailureDetected +} + +function Invoke-TokenRefreshPrompt { + <# + .SYNOPSIS + Handles token refresh for interactive auth modes (WebLogin/DeviceCode). + .DESCRIPTION + Triggered reactively when a 401 Unauthorized error is detected. + First attempts silent token refresh (SDK may have valid refresh token cached). + If silent refresh fails, prompts user to re-authenticate or quit. + .OUTPUTS + 'Refreshed' - Token refreshed successfully (silent or interactive) + 'Quit' - User chose to exit + #> + + $tokenAge = if ($script:TokenAcquiredTime) { (Get-Date) - $script:TokenAcquiredTime } else { $null } + + # ═══════════════════════════════════════════════════════════════════════════ + # NOTE: We intentionally do NOT attempt automatic token refresh here. + # On Windows, Connect-MgGraph with InteractiveBrowserCredential ALWAYS opens + # a browser popup. If the user is away, this popup sits waiting, and when + # they return and press 'R' to re-auth, a SECOND popup appears. + # By going straight to the user prompt, we ensure only ONE popup when ready. 
+ # ═══════════════════════════════════════════════════════════════════════════ + + # ═══════════════════════════════════════════════════════════════════════════ + # Prompt user for interactive re-authentication (single popup, user-initiated) + # ═══════════════════════════════════════════════════════════════════════════ + + # Get progress info for display + $completedCount = if ($script:CheckpointData) { $script:CheckpointData.statistics.partitionsComplete } else { 0 } + $totalCount = if ($script:CheckpointData) { $script:CheckpointData.partitions.total } else { 0 } + $recordsSaved = if ($script:CheckpointData) { $script:CheckpointData.statistics.totalRecordsSaved } else { 0 } + + Write-Host "" + Write-Host "════════════════════════════════════════════════════════════════════════════════" -ForegroundColor Red + Write-Host " [!] AUTHENTICATION EXPIRED - Re-authentication Required" -ForegroundColor Red + Write-Host "════════════════════════════════════════════════════════════════════════════════" -ForegroundColor Red + Write-Host "" + Write-Host " Authentication failure detected (401 Unauthorized)." -ForegroundColor White + Write-Host " Your session token has expired. Please re-authenticate to continue." 
-ForegroundColor White + Write-Host "" + if ($tokenAge) { + Write-Host " Session duration: $([Math]::Round($tokenAge.TotalMinutes, 0)) minutes" -ForegroundColor Gray + } + Write-Host " Progress: $completedCount/$totalCount partitions complete | $($recordsSaved.ToString('N0')) records saved to disk" -ForegroundColor White + Write-Host " Auth method: $($Auth) (interactive)" -ForegroundColor Gray + Write-Host "" + Write-Host " [R] Re-authenticate now (recommended)" -ForegroundColor Green + Write-Host " [Q] Quit and save progress (resume later with -Resume)" -ForegroundColor Cyan + Write-Host "" + Write-Host "════════════════════════════════════════════════════════════════════════════════" -ForegroundColor Red + + Send-PromptNotification + while ($true) { + $choice = Read-Host " Enter choice (R/Q)" + + switch ($choice.ToUpper()) { + 'R' { + Write-Host "" + Write-Host " Re-authenticating..." -ForegroundColor Cyan + + try { + # Disconnect and reconnect + Disconnect-MgGraph -ErrorAction SilentlyContinue | Out-Null + Connect-PurviewAudit -AuthMethod $Auth -UseEOMMode $false + + # Update token timing and reset auth failure flags + $script:TokenAcquiredTime = Get-Date + $script:AuthFailureDetected = $false + $script:Auth401MessageShown = $false # Reset for next auth failure cycle + + Write-Host " Re-authentication successful. Resuming execution..." -ForegroundColor Green + Write-Host " Failed partitions will be retried with fresh token." -ForegroundColor Green + Write-Host "" + + return 'Refreshed' + } + catch { + Write-Host " ✗ Re-authentication failed: $($_.Exception.Message)" -ForegroundColor Red + Write-Host " Please try again or quit." -ForegroundColor Yellow + } + } + 'Q' { + Write-Host "" + Show-CheckpointExitMessage + return 'Quit' + } + default { + Write-Host " Invalid choice. Please enter R or Q." -ForegroundColor Red + } + } + } +} + +function Merge-IncrementalSaves { + <# + .SYNOPSIS + Merges incremental JSON save files into the main allLogs collection. 
+ .DESCRIPTION + Called at the end of execution to consolidate partition data that was + saved incrementally during execution. In resume mode, only merges data + from partitions that were skipped (already completed at start of run), + since partitions completed during this run already have data in memory. + .PARAMETER AllLogs + Reference to the synchronized ArrayList to add records to. + .PARAMETER OutputDirectory + The output directory containing the .pax_incremental folder. + .PARAMETER CleanupAfterMerge + If true, deletes the incremental files after successful merge. + .PARAMETER OnlyPartitionIndices + If specified, only merge files for these partition indices. + Used in resume mode to avoid double-counting data from partitions + that completed during this run. + #> + param( + [Parameter(Mandatory = $true)] + [System.Collections.ArrayList]$AllLogs, + + [Parameter(Mandatory = $true)] + [string]$OutputDirectory, + + [Parameter(Mandatory = $false)] + [bool]$CleanupAfterMerge = $true, + + [Parameter(Mandatory = $false)] + [int[]]$OnlyPartitionIndices = $null + ) + + $incrementalDir = Join-Path $OutputDirectory ".pax_incremental" + + if (-not (Test-Path $incrementalDir)) { + return 0 + } + + $incrementalFiles = Get-ChildItem -Path $incrementalDir -Filter "*.jsonl" -ErrorAction SilentlyContinue + + if (-not $incrementalFiles -or $incrementalFiles.Count -eq 0) { + return 0 + } + + $mergedCount = 0 + $filesProcessed = 0 + $filesSkipped = 0 + + $filterMsg = if ($OnlyPartitionIndices) { " (filtering for partitions: $($OnlyPartitionIndices -join ', '))" } else { "" } + Write-LogHost " [MERGE] Found $($incrementalFiles.Count) incremental save files$filterMsg..." 
-ForegroundColor Cyan + + foreach ($file in $incrementalFiles) { + try { + # If filtering by partition indices, check if this file matches + # Filename format: Part{N}_timestamp_qid-{QueryId}_Nrecords.jsonl (recovery files use qid-recovery) + if ($OnlyPartitionIndices) { + $partMatch = [regex]::Match($file.Name, '^Part(\d+)_') + if ($partMatch.Success) { + $filePartitionIndex = [int]$partMatch.Groups[1].Value + if ($filePartitionIndex -notin $OnlyPartitionIndices) { + $filesSkipped++ + # Don't delete - this file is for a partition completed in this run + continue + } + } + } + + # Read JSON Lines (NDJSON) format - one record per line + $lines = Get-Content -Path $file.FullName -Encoding utf8 + $fileRecordCount = 0 + + foreach ($line in $lines) { + if ([string]::IsNullOrWhiteSpace($line)) { continue } + + try { + $record = $line | ConvertFrom-Json + [void]$AllLogs.Add($record) + $fileRecordCount++ + } catch { + # Skip malformed lines but continue processing + Write-Verbose "Skipped malformed line in $($file.Name)" + } + } + + $mergedCount += $fileRecordCount + if ($fileRecordCount -gt 0) { $filesProcessed++ } + + if ($CleanupAfterMerge) { + Remove-Item -Path $file.FullName -Force -ErrorAction SilentlyContinue + } + } + catch { + Write-LogHost " [WARN] Failed to merge $($file.Name): $($_.Exception.Message)" -ForegroundColor Yellow + } + } + + # Clean up directory if empty + if ($CleanupAfterMerge) { + $remainingFiles = Get-ChildItem -Path $incrementalDir -ErrorAction SilentlyContinue + if (-not $remainingFiles -or $remainingFiles.Count -eq 0) { + Remove-Item -Path $incrementalDir -Force -ErrorAction SilentlyContinue + } + } + + if ($mergedCount -gt 0) { + Write-LogHost " [MERGE] Merged $($mergedCount.ToString('N0')) records from $filesProcessed incremental files" -ForegroundColor Green + } + + return $mergedCount +} + +function Merge-IncrementalSaves-Streaming { + <# + .SYNOPSIS + Memory-efficient streaming merge of incremental JSONL files directly to CSV. 
+ .DESCRIPTION + Instead of loading all records into memory, this function streams records + from incremental JSONL files directly to the final CSV output using batched + writes and explicit garbage collection between files. This prevents memory + exhaustion when merging millions of records. + .PARAMETER OutputFile + The final CSV file path to write merged data to. + .PARAMETER OutputDirectory + The output directory containing the .pax_incremental folder. + .PARAMETER OnlyPartitionIndices + If specified, only merge files for these partition indices. + .PARAMETER Columns + The column schema to use for CSV output. If not specified, uses default 7-column schema. + .PARAMETER RunTimestamp + The script run timestamp used to filter incremental files to only those from the current run. + Prevents stale files from prior runs being merged into the output. + .RETURNS + The total number of records merged. + #> + param( + [Parameter(Mandatory = $true)] + [string]$OutputFile, + + [Parameter(Mandatory = $true)] + [string]$OutputDirectory, + + [Parameter(Mandatory = $false)] + [int[]]$OnlyPartitionIndices = $null, + + [Parameter(Mandatory = $false)] + [string[]]$Columns = $null, + + [Parameter(Mandatory = $false)] + [System.Collections.Generic.HashSet[string]]$ExcludeRecordIds = $null, + + [Parameter(Mandatory = $false)] + [ref]$ActivityCounts = $null, + + [Parameter(Mandatory = $false)] + [string]$RunTimestamp = $null + ) + + $incrementalDir = Join-Path $OutputDirectory ".pax_incremental" + + if (-not (Test-Path $incrementalDir)) { + Write-LogHost " [MERGE-STREAM] No incremental directory found" -ForegroundColor Yellow + return 0 + } + + # Filter by run timestamp to avoid merging stale files from prior runs + $jsonlFilter = if ($RunTimestamp) { "*_${RunTimestamp}_*.jsonl" } else { "*.jsonl" } + $allFiles = Get-ChildItem -Path $incrementalDir -Filter $jsonlFilter -ErrorAction SilentlyContinue + if (-not $allFiles -or $allFiles.Count -eq 0) { + Write-LogHost " [MERGE-STREAM] No 
incremental files found$(if ($RunTimestamp) { " for run $RunTimestamp" })" -ForegroundColor Yellow + return 0 + } + + # Sort files by partition number for consistent output ordering + $files = $allFiles | Sort-Object { + if ($_.Name -match 'Part(\d+)_') { [int]$Matches[1] } else { 999999 } + } + + # Filter by partition indices if specified + if ($OnlyPartitionIndices) { + $files = $files | Where-Object { + $partMatch = [regex]::Match($_.Name, '^Part(\d+)_') + if ($partMatch.Success) { + [int]$partMatch.Groups[1].Value -in $OnlyPartitionIndices + } else { + $false + } + } + } + + if (-not $files -or @($files).Count -eq 0) { + Write-LogHost " [MERGE-STREAM] No matching incremental files for specified partitions" -ForegroundColor Yellow + return 0 + } + + # When multiple JSONL files exist for the same partition (from retries with different QueryIds), + # keep only the largest file per partition. This prevents duplicate records from partial first attempts + # being merged alongside the full retry result. + $filesByPartition = @{} + foreach ($f in @($files)) { + if ($f.Name -match '^Part(\d+)_') { + $pIdx = [int]$Matches[1] + if (-not $filesByPartition.ContainsKey($pIdx) -or $f.Length -gt $filesByPartition[$pIdx].Length) { + $filesByPartition[$pIdx] = $f + } + } + } + $deduplicatedFiles = @($filesByPartition.Values | Sort-Object { if ($_.Name -match 'Part(\d+)_') { [int]$Matches[1] } else { 999999 } }) + $removedFileCount = @($files).Count - $deduplicatedFiles.Count + if ($removedFileCount -gt 0) { + Write-LogHost " [MERGE-STREAM] Removed $removedFileCount duplicate partition file(s) from prior retry attempts — keeping largest per partition" -ForegroundColor DarkYellow + } + $files = $deduplicatedFiles + + $fileCount = @($files).Count + Write-LogHost " [MERGE-STREAM] Streaming $fileCount incremental files to CSV..." 
-ForegroundColor Cyan + + # Use default 8-column schema if not specified (non-explosion mode) + # Column names match Purview UI audit export: RecordId, CreationDate, RecordType, Operation, UserId, AuditData, AssociatedAdminUnits, AssociatedAdminUnitsNames + if (-not $Columns) { + $Columns = @('RecordId', 'CreationDate', 'RecordType', 'Operation', 'UserId', 'AuditData', 'AssociatedAdminUnits', 'AssociatedAdminUnitsNames') + } + + $totalMerged = 0 + $filesProcessed = 0 + $headerWritten = $false + $batchSize = 5000 + $startTime = Get-Date + $lastProgressTime = Get-Date + + # Track seen RecordIds for deduplication (seed with any in-memory RecordIds already written to CSV) + $seenIds = if ($ExcludeRecordIds) { New-Object System.Collections.Generic.HashSet[string] ($ExcludeRecordIds) } else { New-Object System.Collections.Generic.HashSet[string] } + $duplicatesSkipped = 0 + + foreach ($file in $files) { + $filesProcessed++ + $partNum = if ($file.Name -match 'Part(\d+)_') { $Matches[1] } else { "?" 
} + + try { + # Open CSV writer on first file with data + if (-not $headerWritten) { + Open-CsvWriter -Path $OutputFile -Columns $Columns + $headerWritten = $true + } + + # Stream records from this file in batches + $batch = New-Object System.Collections.Generic.List[object] + $fileRecords = 0 + + # Use StreamReader instead of Get-Content pipeline for ~5-10x faster file reading + $reader = [System.IO.StreamReader]::new($file.FullName, [System.Text.Encoding]::UTF8) + while ($null -ne ($line = $reader.ReadLine())) { + if (-not [string]::IsNullOrWhiteSpace($line)) { + try { + $record = $line | ConvertFrom-Json + + # Deduplicate by RecordId + $recordId = $null + if ($record.Identity) { $recordId = $record.Identity } + elseif ($record.Id) { $recordId = $record.Id } + elseif ($record.RecordId) { $recordId = $record.RecordId } + + if ($recordId -and $seenIds.Contains($recordId)) { + $script:StreamingMergeDuplicatesSkipped++ + continue # Skip duplicate — 'continue' works in while loop (was 'return' in ForEach-Object) + } + if ($recordId) { [void]$seenIds.Add($recordId) } + + # Client-side date-range trimming for streaming path + if ($script:TrimStartDateUTC -or $script:TrimEndDateUTC) { + $recDate = script:Parse-DateSafe $record.CreationDate + if ($recDate) { + $recDateUtc = $recDate.ToUniversalTime() + if ($script:TrimStartDateUTC -and $recDateUtc -lt $script:TrimStartDateUTC) { $script:DateTrimCount++; continue } + if ($script:TrimEndDateUTC -and $recDateUtc -ge $script:TrimEndDateUTC) { $script:DateTrimCount++; continue } + } + } + + # Parse AuditData for Operation if needed + $auditData = $record.AuditData + $parsedAudit = if ($record.PSObject.Properties['_ParsedAuditData']) { + $record._ParsedAuditData + } else { + try { $auditData | ConvertFrom-Json -ErrorAction SilentlyContinue } catch { $null } + } + $opValue = if ($parsedAudit -and $parsedAudit.Operation) { $parsedAudit.Operation } else { $record.Operations } + + # Track per-activity counts for Activity Type 
Breakdown + if ($ActivityCounts -and $opValue) { + if (-not $ActivityCounts.Value.ContainsKey($opValue)) { $ActivityCounts.Value[$opValue] = 0 } + $ActivityCounts.Value[$opValue]++ + } + + # Create normalized record matching expected schema (column names match Purview UI export) + $normalizedRecord = [pscustomobject]@{ + RecordId = if ($record.RecordId) { $record.RecordId } elseif ($record.Identity) { $record.Identity } elseif ($record.Id) { $record.Id } elseif ($parsedAudit -and $parsedAudit.Id) { $parsedAudit.Id } else { $null } + CreationDate = if ($record.CreationDate) { + $dt = script:Parse-DateSafe $record.CreationDate; if ($dt) { $dt.ToUniversalTime().ToString('yyyy-MM-ddTHH:mm:ss.fffZ') } else { $record.CreationDate } + } else { '' } + RecordType = $record.RecordType + Operation = $opValue + UserId = if ($record.UserId) { $record.UserId } elseif ($record.UserIds) { $record.UserIds } else { '' } + AuditData = $auditData + AssociatedAdminUnits = $(try { if ($parsedAudit.AssociatedAdminUnits) { $parsedAudit.AssociatedAdminUnits } elseif ($record.AssociatedAdminUnits) { $record.AssociatedAdminUnits } else { '' } } catch { '' }) + AssociatedAdminUnitsNames = $(try { if ($parsedAudit.AssociatedAdminUnitsNames) { $parsedAudit.AssociatedAdminUnitsNames } elseif ($record.AssociatedAdminUnitsNames) { $record.AssociatedAdminUnitsNames } else { '' } } catch { '' }) + } + + $batch.Add($normalizedRecord) + $fileRecords++ + + # Write batch when full + if ($batch.Count -ge $batchSize) { + Write-CsvRows -Rows $batch -Columns $Columns + $totalMerged += $batch.Count + $batch.Clear() + } + } catch { + # Skip malformed lines + Write-Verbose "Skipped malformed line in $($file.Name): $($_.Exception.Message)" + } + } + } + # Dispose StreamReader to release file handle + if ($reader) { $reader.Dispose(); $reader = $null } + + # Flush remaining batch for this file + if ($batch.Count -gt 0) { + Write-CsvRows -Rows $batch -Columns $Columns + $totalMerged += $batch.Count + 
$batch.Clear() + } + + # Progress reporting every 30 seconds + $now = Get-Date + if (($now - $lastProgressTime).TotalSeconds -ge 30) { + $elapsed = ($now - $startTime).TotalSeconds + $rate = if ($elapsed -gt 0) { [int]($totalMerged / $elapsed) } else { 0 } + Write-LogHost " [MERGE-STREAM] Progress: $filesProcessed/$fileCount files | $($totalMerged.ToString('N0')) records | ~$rate rec/sec" -ForegroundColor DarkCyan + $lastProgressTime = $now + } + + # Reduced GC frequency from every file to every 5th file for better throughput + $batch = $null + if ($filesProcessed % 5 -eq 0) { + [GC]::Collect() + [GC]::WaitForPendingFinalizers() + } + + } catch { + # Ensure StreamReader is disposed on error to release file handle + if ($reader) { try { $reader.Dispose() } catch {} ; $reader = $null } + Write-LogHost " [WARN] Failed to stream merge $($file.Name): $($_.Exception.Message)" -ForegroundColor Yellow + } + } + + # Close CSV writer + if ($headerWritten) { + Close-CsvWriter + } + + # Final stats + $totalElapsed = (Get-Date) - $startTime + $finalRate = if ($totalElapsed.TotalSeconds -gt 0) { [int]($totalMerged / $totalElapsed.TotalSeconds) } else { 0 } + + Write-LogHost " [MERGE-STREAM] Streaming merge complete: $($totalMerged.ToString('N0')) records from $filesProcessed files" -ForegroundColor Green + Write-LogHost " [MERGE-STREAM] Time: $([Math]::Round($totalElapsed.TotalSeconds, 1))s | Rate: $finalRate rec/sec" -ForegroundColor DarkGray + if ($duplicatesSkipped -gt 0 -or $script:StreamingMergeDuplicatesSkipped -gt 0) { + $totalDupes = $duplicatesSkipped + $script:StreamingMergeDuplicatesSkipped + Write-LogHost " [MERGE-STREAM] Duplicates skipped: $totalDupes" -ForegroundColor DarkGray + } + + # Clear the HashSet to free memory + $seenIds.Clear() + $seenIds = $null + [GC]::Collect() + + return $totalMerged +} + +function Show-CheckpointExitMessage { + <# + .SYNOPSIS + Displays checkpoint save confirmation and resume instructions. 
+ #> + + if (-not $script:CheckpointData -or -not $script:CheckpointPath) { + return + } + + $completedCount = if ($script:CheckpointData.statistics.partitionsComplete) { $script:CheckpointData.statistics.partitionsComplete } else { 0 } + $queryCreatedCount = if ($script:CheckpointData.statistics.partitionsQueryCreated) { $script:CheckpointData.statistics.partitionsQueryCreated } else { 0 } + $totalCount = if ($script:CheckpointData.partitions.total) { $script:CheckpointData.partitions.total } else { 0 } + $remaining = $totalCount - $completedCount - $queryCreatedCount + $recordsSaved = if ($script:CheckpointData.statistics.totalRecordsSaved) { $script:CheckpointData.statistics.totalRecordsSaved } else { 0 } + + Write-Host "" + Write-Host "════════════════════════════════════════════════════════════════════════════════" -ForegroundColor Green + Write-Host " PROGRESS SAVED" -ForegroundColor Green + Write-Host "════════════════════════════════════════════════════════════════════════════════" -ForegroundColor Green + Write-Host "" + Write-Host " Checkpoint: $(Split-Path $script:CheckpointPath -Leaf)" -ForegroundColor White + Write-Host " Partial data: $(if ($script:PartialOutputPath) { Split-Path $script:PartialOutputPath -Leaf } else { '(incremental saves in .pax_incremental/)' })" -ForegroundColor White + Write-Host " Records saved: $($recordsSaved.ToString('N0'))" -ForegroundColor White + Write-Host " Partitions: $completedCount/$totalCount complete" -NoNewline -ForegroundColor White + if ($queryCreatedCount -gt 0) { + Write-Host ", $queryCreatedCount queries pending" -NoNewline -ForegroundColor White + } + if ($remaining -gt 0) { + Write-Host ", $remaining not started" -ForegroundColor White + } + else { + Write-Host "" -ForegroundColor White + } + Write-Host "" + Write-Host " To resume later:" -ForegroundColor Cyan + Write-Host " -Resume -OutputPath `"$(Split-Path $script:CheckpointPath -Parent)`"" -ForegroundColor White + Write-Host "" + Write-Host " Or with 
explicit checkpoint file:" -ForegroundColor Cyan + Write-Host " -Resume `"$($script:CheckpointPath)`"" -ForegroundColor White + Write-Host "" + Write-Host "════════════════════════════════════════════════════════════════════════════════" -ForegroundColor Green +} + +function Complete-CheckpointRun { + <# + .SYNOPSIS + Finalizes successful run: renames _PARTIAL file, deletes checkpoint. + .PARAMETER FinalOutputPath + The final output path (without _PARTIAL). + #> + param( + [Parameter(Mandatory)] + [string]$FinalOutputPath + ) + + if (-not $script:PartialOutputPath -or -not (Test-Path $script:PartialOutputPath)) { + return + } + + try { + # Rename _PARTIAL to final + if (Test-Path $FinalOutputPath) { + # Final file already exists - add timestamp to avoid overwrite + $dir = Split-Path $FinalOutputPath -Parent + $name = [System.IO.Path]::GetFileNameWithoutExtension($FinalOutputPath) + $ext = [System.IO.Path]::GetExtension($FinalOutputPath) + $timestamp = Get-Date -Format 'yyyyMMdd_HHmmss' + $FinalOutputPath = Join-Path $dir "${name}_${timestamp}${ext}" + } + + Move-Item -Path $script:PartialOutputPath -Destination $FinalOutputPath -Force + + # Rename log file (remove _PARTIAL suffix) + $partialLogPath = $script:LogFile + if ($partialLogPath -and (Test-Path $partialLogPath) -and $partialLogPath -match '_PARTIAL\.log$') { + $finalLogPath = $partialLogPath -replace '_PARTIAL\.log$', '.log' + if (Test-Path $finalLogPath) { + $logDir = Split-Path $finalLogPath -Parent + $logName = [System.IO.Path]::GetFileNameWithoutExtension($finalLogPath) + $timestamp = Get-Date -Format 'yyyyMMdd_HHmmss' + $finalLogPath = Join-Path $logDir "${logName}_${timestamp}.log" + } + Move-Item -Path $partialLogPath -Destination $finalLogPath -Force + $script:LogFile = $finalLogPath + } + + # Delete checkpoint + Remove-Checkpoint + + $script:PartialOutputPath = $null + } + catch { + Write-LogHost " Warning: Could not finalize output file: $($_.Exception.Message)" -ForegroundColor Yellow + } +} + 
+function Get-UserLicenseData {
+    <#
+    .SYNOPSIS
+        Fetches user license information from Microsoft Graph API.
+
+    .DESCRIPTION
+        Reads /subscribedSkus to build a skuId -> skuPartNumber lookup, then pages through
+        /users (id, userPrincipalName, assignedLicenses) by following @odata.nextLink and
+        builds two lookup hashtables:
+        - User license assignments (user id / UPN -> list of SKU names)
+        - Copilot license detection (user id / UPN -> has Copilot license)
+
+        Uses two-tier Copilot license detection:
+        1. The license skuId is one of the keys of $script:CopilotSkuIds
+        2. The SKU name matches "*Copilot*" (catches new/promotional variants)
+
+        Each user is stored under both its object id and its UPN so callers can look up by
+        either. Users missing an id or a UPN are skipped. UserLicenses only receives an entry
+        for users with at least one license; UserHasCopilot receives an entry for every
+        processed user.
+
+    .OUTPUTS
+        Hashtable with two keys:
+        - UserLicenses: @{userId = @('SKU1', 'SKU2')}
+        - UserHasCopilot: @{userId = $true/$false}
+        If any Graph request fails, a warning is logged and both hashtables are returned empty.
+    #>
+
+    Write-LogHost ""
+    Write-LogHost "Fetching license data from Microsoft Graph API..." -ForegroundColor Cyan
+
+    try {
+        # Fetch all subscribed SKUs for the tenant
+        $uri = "https://graph.microsoft.com/v1.0/subscribedSkus"
+        $response = Invoke-MgGraphRequest -Method GET -Uri $uri -ErrorAction Stop
+        $skus = $response.value
+
+        Write-LogHost " Found $($skus.Count) SKU(s) in tenant" -ForegroundColor Gray
+
+        # Build SKU lookup: skuId -> skuPartNumber
+        $skuLookup = @{}
+        foreach ($sku in $skus) {
+            # A null key would throw and abort the whole license fetch; skip such entries instead
+            if ($null -eq $sku.skuId) { continue }
+            $skuLookup[$sku.skuId] = $sku.skuPartNumber
+        }
+
+        # Known Copilot SKU IDs (tier 1). Guarded so a missing table simply disables tier 1.
+        $copilotSkuIds = if ($script:CopilotSkuIds) { @($script:CopilotSkuIds.Keys) } else { @() }
+
+        # Build user license hashtables
+        $userLicenses = @{}     # userId/UPN -> @('SKU1', 'SKU2')
+        $userHasCopilot = @{}   # userId/UPN -> $true/$false
+
+        # Fetch all users with their assigned licenses, one page at a time
+        Write-LogHost " Fetching user license assignments..." -ForegroundColor Gray
+        $userUri = "https://graph.microsoft.com/v1.0/users?`$select=id,userPrincipalName,assignedLicenses&`$top=999"
+        $userCount = 0
+        $licensedUserCount = 0
+        $copilotUserCount = 0
+
+        do {
+            $userResponse = Invoke-MgGraphRequest -Method GET -Uri $userUri -ErrorAction Stop
+
+            foreach ($user in $userResponse.value) {
+                $userId = $user.id
+                $upn = $user.userPrincipalName
+
+                if (-not $userId -or -not $upn) { continue }
+
+                $userCount++
+                # List avoids re-allocating the array on every appended license
+                $licenses = New-Object System.Collections.Generic.List[object]
+                $hasCopilot = $false
+
+                foreach ($license in $user.assignedLicenses) {
+                    $skuId = $license.skuId
+                    # ContainsKey($null) throws; an assignment without a skuId carries no usable data
+                    if ($null -eq $skuId) { continue }
+
+                    # Get friendly SKU name, falling back to the GUID if not in lookup
+                    $skuName = if ($skuLookup.ContainsKey($skuId)) { $skuLookup[$skuId] } else { $skuId }
+                    $licenses.Add($skuName)
+
+                    # Two-tier Copilot detection: known SKU ID, or SKU name containing "Copilot"
+                    if (($copilotSkuIds -contains $skuId) -or ($skuName -like "*Copilot*")) {
+                        $hasCopilot = $true
+                    }
+                }
+
+                # Store license data keyed by both ID and UPN for flexible lookup
+                if ($licenses.Count -gt 0) {
+                    $licensedUserCount++
+                    # Stored as a plain object[] so consumers see the same type as before
+                    $licenseArray = $licenses.ToArray()
+                    $userLicenses[$userId] = $licenseArray
+                    $userLicenses[$upn] = $licenseArray
+                }
+
+                $userHasCopilot[$userId] = $hasCopilot
+                $userHasCopilot[$upn] = $hasCopilot
+
+                if ($hasCopilot) {
+                    $copilotUserCount++
+                }
+            }
+
+            # Absent on the last page, which ends the loop
+            $userUri = $userResponse.'@odata.nextLink'
+        } while ($userUri)
+
+        # Report total and licensed users separately; the old message labelled the total as licensed
+        Write-LogHost " Processed $userCount user(s) ($licensedUserCount with license assignments)" -ForegroundColor Gray
+        Write-LogHost " Detected $copilotUserCount user(s) with Copilot licenses" -ForegroundColor Green
+        Write-LogHost ""
+
+        return @{
+            UserLicenses = $userLicenses
+            UserHasCopilot = $userHasCopilot
+        }
+    }
+    catch {
+        Write-LogHost "WARNING: Failed to fetch license data: $($_.Exception.Message)" -ForegroundColor Yellow
+        Write-LogHost " License columns will be empty in export" -ForegroundColor Yellow
+        Write-LogHost ""
+
+        # Return empty hashtables on failure so callers can proceed without license columns
+        return @{
+            UserLicenses = @{}
+            UserHasCopilot = @{}
+        }
+    }
+}
+
+function ConvertTo-FlatEntraUsers { + <# + .SYNOPSIS + Flattens Entra user objects into CSV-friendly format. + + .DESCRIPTION + Converts Entra ID user objects with nested properties into flat tabular format. + Filters out non-user accounts (rooms, resources) based on userType validation. + Explodes arrays (proxyAddresses, manager) into individual columns. + + NOTE: This version excludes the 9 granular Entra license columns from the Graph script. + License data is added separately via Get-UserLicenseData() in MAC format (assignedLicenses, hasLicense). + + .PARAMETER Users + Array of user objects from Microsoft Graph API (with 35 properties + manager expansion). + + .OUTPUTS + Array of PSCustomObjects with 37 flattened columns (35 user properties + 5 manager columns, no license columns yet). + #> + param( + [Parameter(Mandatory = $true)] + [array]$Users + ) + + $flattenedUsers = @() + + foreach ($user in $Users) { + # Filter: Only include real user accounts (exclude rooms, resources, shared mailboxes) + # Room and resource mailboxes have specific characteristics: + # - userType is often null or not "Member"/"Guest" + # - They typically lack givenName and surname + # - They often have mail but no userPrincipalName with typical user format + + $userTypeValue = $user.userType + + # Skip if userType is null/empty (likely a room or resource) + if ([string]::IsNullOrWhiteSpace($userTypeValue)) { + continue + } + + # Only include users with userType = "Member" or "Guest" + # Rooms/resources typically have different userType values or null + if ($userTypeValue -ne 'Member' -and $userTypeValue -ne 'Guest') { + continue + } + + # Additional heuristic: Real users typically have either givenName or surname + # Room mailboxes typically have neither (only displayName) + # This is not foolproof but combined with userType check, it's quite reliable + $hasGivenName = -not [string]::IsNullOrWhiteSpace($user.givenName) + $hasSurname = -not [string]::IsNullOrWhiteSpace($user.surname) + 
+ # If user has Member/Guest type but no name components, might be a shared resource + # Allow through if they have at least givenName OR surname OR if account is enabled + # (most room mailboxes are enabled but lack name components) + if (-not $hasGivenName -and -not $hasSurname -and $user.accountEnabled) { + # Additional check: if they have licenses assigned, likely a real user + if (-not $user.assignedLicenses -or $user.assignedLicenses.Count -eq 0) { + # No licenses and no name components - likely a room/resource + continue + } + } + + $flatUser = [ordered]@{} + + # Core Identity Properties (simple strings) + $flatUser['userPrincipalName'] = $user.userPrincipalName + $flatUser['DisplayName'] = $user.displayName + $flatUser['id'] = $user.id + $flatUser['Email'] = $user.mail + $flatUser['givenName'] = $user.givenName + $flatUser['surname'] = $user.surname + + # Job Properties + $flatUser['JobTitle'] = $user.jobTitle + $flatUser['department'] = $user.department + $flatUser['employeeType'] = $user.employeeType + $flatUser['employeeId'] = $user.employeeId + $flatUser['employeeHireDate'] = $user.employeeHireDate + + # Location Properties + $flatUser['officeLocation'] = $user.officeLocation + $flatUser['city'] = $user.city + $flatUser['state'] = $user.state + $flatUser['Country'] = $user.country + $flatUser['postalCode'] = $user.postalCode + $flatUser['companyName'] = $user.companyName + + # Organizational Properties + $flatUser['employeeOrgData_division'] = if ($user.employeeOrgData) { $user.employeeOrgData.division } else { $null } + $flatUser['employeeOrgData_costCenter'] = if ($user.employeeOrgData) { $user.employeeOrgData.costCenter } else { $null } + + # Status Properties + $flatUser['accountEnabled'] = $user.accountEnabled + $flatUser['userType'] = $user.userType + $flatUser['createdDateTime'] = $user.createdDateTime + + # Usage Properties + $flatUser['usageLocation'] = $user.usageLocation + $flatUser['preferredLanguage'] = $user.preferredLanguage + + # Sync 
Properties + $flatUser['onPremisesSyncEnabled'] = $user.onPremisesSyncEnabled + $flatUser['onPremisesImmutableId'] = $user.onPremisesImmutableId + $flatUser['externalUserState'] = $user.externalUserState + + # Explode proxyAddresses array (Email aliases) + if ($user.proxyAddresses -and $user.proxyAddresses.Count -gt 0) { + $primarySMTP = $user.proxyAddresses | Where-Object { $_ -like 'SMTP:*' } | Select-Object -First 1 + $flatUser['proxyAddresses_Primary'] = if ($primarySMTP) { $primarySMTP -replace '^SMTP:', '' } else { $null } + $flatUser['proxyAddresses_Count'] = $user.proxyAddresses.Count + $flatUser['proxyAddresses_All'] = ($user.proxyAddresses -join '; ') + } + else { + $flatUser['proxyAddresses_Primary'] = $null + $flatUser['proxyAddresses_Count'] = 0 + $flatUser['proxyAddresses_All'] = $null + } + + # Handle manager object separately (flatten to individual columns) + if ($user.manager) { + $flatUser['manager_id'] = $user.manager.id + $flatUser['manager_displayName'] = $user.manager.displayName + $flatUser['manager_userPrincipalName'] = $user.manager.userPrincipalName + $flatUser['manager_mail'] = $user.manager.mail + $flatUser['manager_jobTitle'] = $user.manager.jobTitle + } + else { + $flatUser['manager_id'] = $null + $flatUser['manager_displayName'] = $null + $flatUser['manager_userPrincipalName'] = $null + $flatUser['manager_mail'] = $null + $flatUser['manager_jobTitle'] = $null + } + + # License columns will be added separately by Get-EntraUsersData() + # using Get-UserLicenseData() to provide MAC-format columns: + # - assignedLicenses (semicolon-separated SKU names) + # - hasLicense (Copilot detection boolean) + + # ===================================================================== + # Power BI AI-in-One Dashboard 2701 Template Compatibility Columns + # These alias columns map existing Graph API data to 2701 template column names + # ===================================================================== + $flatUser['ManagerID'] = 
$flatUser['manager_id'] + $flatUser['BusinessAreaLabel'] = $flatUser['employeeOrgData_division'] + $flatUser['CountryofEmployment'] = $flatUser['Country'] + $flatUser['CompanyCodeLabel'] = $flatUser['companyName'] + $flatUser['CostCentreLabel'] = $flatUser['employeeOrgData_costCenter'] + $flatUser['UserName'] = $flatUser['DisplayName'] + + # Viva Insights-specific columns (not available from Microsoft Graph API) + # These are placeholders for template compatibility - data must come from HR systems + $flatUser['EffectiveDate'] = $null + $flatUser['FunctionType'] = $null + $flatUser['BusinessAreaCode'] = $null + $flatUser['OrgLevel_3Label'] = $null + + # Convert ordered hashtable to PSCustomObject for proper CSV export + $flattenedUsers += [PSCustomObject]$flatUser + } + + return $flattenedUsers +} + +function Get-EntraUsersData { + <#! + .SYNOPSIS + Collects and flattens Entra ID (Azure AD) user directory data and enriches with MAC-format license info. + + .DESCRIPTION + # New naming: Purview_Audit_CombinedUsageActivity[_EntraUsers]_timestamp.xlsx + $baseName = "Purview_Audit_CombinedUsageActivity" + if ($IncludeUserInfo -and -not $UseEOM) { $baseName += "_EntraUsers" } + $excelDescriptor = if ($IncludeUserInfo -and -not $UseEOM) { 'multi-tab workbook (CombinedActivity + EntraUsers)' } else { 'single-tab workbook' } + Write-LogHost "Output File: ${outputDir}${baseName}_.xlsx ($excelDescriptor)" -ForegroundColor White + Filters out non-user principals (rooms/resources) using userType + name heuristics identical to ConvertTo-FlatEntraUsers. + Flattens users via ConvertTo-FlatEntraUsers, then appends two MAC-aligned columns: + • assignedLicenses (semicolon-separated SKU `skuPartNumber` names) + • hasLicense (Copilot license boolean; renamed from UserHasCopilotLicense) + + License enrichment uses existing Get-UserLicenseData() hashtables (UserLicenses, UserHasCopilot). + Only called when -IncludeUserInfo is specified (Graph API mode). 
+ + .OUTPUTS + Array[psobject] of flattened users (30 core columns + 5 manager columns + 2 license columns = 37 total). + + .NOTES + If Graph API call fails, returns empty array with warning. License enrichment silently skips if lookup missing. + #> + param( + [switch]$Quiet + ) + + $entraUsers = @() + try { + if (-not $Quiet) { Write-LogHost "Fetching Entra user directory (35 properties + manager)..." -ForegroundColor Cyan } + + # Properties mirrored from Graph script (excluding license arrays we purposefully omit) + $entraUserSelect = @( + 'userPrincipalName','displayName','id','mail','givenName','surname','jobTitle','department','employeeType','employeeId','employeeHireDate', + 'officeLocation','city','state','country','postalCode','companyName','accountEnabled','userType','createdDateTime','usageLocation', + 'preferredLanguage','onPremisesSyncEnabled','onPremisesImmutableId','externalUserState','employeeOrgData','proxyAddresses' + ) -join ',' + + $baseUri = "https://graph.microsoft.com/v1.0/users?`$select=$entraUserSelect&`$expand=manager&`$top=999" + $nextLink = $baseUri + $rawUsers = @() + $loops = 0 + while ($nextLink) { + $loops++ + $resp = Invoke-GraphRequest -Uri $nextLink -Method GET -ErrorAction Stop + if ($resp.value) { $rawUsers += $resp.value } + $nextLink = $resp.'@odata.nextLink' + if ($loops -gt 2000) { throw "Safety abort: excessive paging (>2000)" } + } + + if (-not $Quiet) { Write-LogHost " Retrieved $($rawUsers.Count) raw user objects" -ForegroundColor Gray } + $flattened = ConvertTo-FlatEntraUsers -Users $rawUsers + if (-not $Quiet) { Write-LogHost " Flattened to $($flattened.Count) user rows (filtered)" -ForegroundColor Gray } + + # License enrichment (MAC-format columns) + $licenseData = $script:LicenseData + foreach ($u in $flattened) { + $upn = $u.userPrincipalName + $assignedNames = $null + $hasCopilot = $false + if ($licenseData) { + # lookup by UPN then id for flexibility + if ($licenseData.UserLicenses.ContainsKey($upn)) { + 
$assignedNames = ($licenseData.UserLicenses[$upn] -join ';') + } elseif ($licenseData.UserLicenses.ContainsKey($u.id)) { + $assignedNames = ($licenseData.UserLicenses[$u.id] -join ';') + } + if ($licenseData.UserHasCopilot.ContainsKey($upn)) { + $hasCopilot = [bool]$licenseData.UserHasCopilot[$upn] + } elseif ($licenseData.UserHasCopilot.ContainsKey($u.id)) { + $hasCopilot = [bool]$licenseData.UserHasCopilot[$u.id] + } + } + Add-Member -InputObject $u -NotePropertyName 'assignedLicenses' -NotePropertyValue $assignedNames -Force + Add-Member -InputObject $u -NotePropertyName 'HasLicense' -NotePropertyValue $hasCopilot -Force + } + $entraUsers = $flattened + # Validate schema (non-fatal) + try { Test-EntraUsersSchema -Users $entraUsers -Quiet:$Quiet } catch { } + } + catch { + Write-LogHost "WARNING: Failed to collect Entra user directory: $($_.Exception.Message)" -ForegroundColor Yellow + } + return $entraUsers +} + +# ============================================== +# GRAPH API QUERY FUNCTIONS +# ============================================== +# REST-based audit log query functions for Microsoft Graph Security API + +function Test-Is429 { + <# + .SYNOPSIS + Safely detects 429 (Too Many Requests) throttling errors. + + .DESCRIPTION + Provides null-safe detection of 429 throttling responses from Graph API. + Handles PowerShell 7+ variations where .Response property may be null. + + Three-layer fallback strategy: + 1. Check .Response.StatusCode (when Response object exists) + 2. Check .Exception.Response.StatusCode directly (PS7+ pattern) + 3. 
Parse error message for '429' string (final fallback) + + .PARAMETER Exception + The caught exception object from try/catch block + + .OUTPUTS + $true if 429 throttling detected, $false otherwise + + .EXAMPLE + try { + Invoke-RestMethod -Uri $uri -Headers $headers + } + catch { + if (Test-Is429 -Exception $_) { + Start-Sleep -Seconds 60 + } + } + #> + + param( + [Parameter(Mandatory = $true)] + [System.Management.Automation.ErrorRecord]$Exception + ) + + # Layer 1: Check .Response.StatusCode (traditional method) + if ($Exception.Exception.Response -and $Exception.Exception.Response.StatusCode) { + if ($Exception.Exception.Response.StatusCode -eq 429 -or $Exception.Exception.Response.StatusCode -eq 'TooManyRequests') { + return $true + } + } + + # Layer 2: Check .Exception.Response.StatusCode directly (PS7+ sometimes skips wrapper) + if ($Exception.Exception.Response.StatusCode) { + if ($Exception.Exception.Response.StatusCode.value__ -eq 429) { + return $true + } + } + + # Layer 3: Parse error message as final fallback + $errorMessage = $Exception.Exception.Message + if ($errorMessage -match '429' -or $errorMessage -match 'Too Many Requests' -or $errorMessage -match 'TooManyRequests') { + return $true + } + + return $false +} + +function Invoke-GraphAuditQuery { + <# + .SYNOPSIS + Creates a new audit log query in Microsoft Graph Security API. + + .DESCRIPTION + Submits an audit log query request to Microsoft Graph Security API. + Returns a query ID that can be used to poll for status and retrieve results. + + The Graph API uses an asynchronous query model: + 1. Submit query (this function) - returns queryId + 2. Poll query status - wait for "succeeded" state + 3. 
Retrieve records - paginated results + + .PARAMETER DisplayName + Friendly name for the query (for tracking purposes) + + .PARAMETER StartDate + Start date/time for audit log query (ISO 8601 format) + + .PARAMETER EndDate + End date/time for audit log query (ISO 8601 format) + + .PARAMETER Operations + Array of operation types to query (e.g., 'CopilotInteraction') + + .PARAMETER RecordTypes + Optional record type filters to include in the Graph query body (passthrough). + + .PARAMETER ServiceTypes + Optional service/workload filters to include in the Graph query body (passthrough). + + .OUTPUTS + Query ID string if successful, $null if failed + #> + + param( + [Parameter(Mandatory = $true)] + [string]$DisplayName, + + [Parameter(Mandatory = $true)] + [Alias('FilterStartDateTime')] + [datetime]$StartDate, + + [Parameter(Mandatory = $true)] + [Alias('FilterEndDateTime')] + [datetime]$EndDate, + + [Parameter(Mandatory = $false)] + [Alias('OperationFilters')] + [string[]]$Operations, + + [Parameter(Mandatory = $false)] + [Alias('RecordTypeFilters')] + [string[]]$RecordTypes, + + [Parameter(Mandatory = $false)] + [Alias('ServiceFilter')] + [string[]]$ServiceTypes + ) + + try { + # Format dates to ISO 8601 format required by Graph API + $startDateStr = $StartDate.ToUniversalTime().ToString('yyyy-MM-ddTHH:mm:ss.fffZ') + $endDateStr = $EndDate.ToUniversalTime().ToString('yyyy-MM-ddTHH:mm:ss.fffZ') + + # Build request body + $body = @{ + displayName = $DisplayName + filterStartDateTime = $startDateStr + filterEndDateTime = $endDateStr + } + + # Fail-safe sanitizer: If operations include M365 usage ops, drop record/service filters + try { + $usageOps = $script:m365UsageActivityBundle + if (-not $usageOps) { $usageOps = $m365UsageActivityBundle } + $hasUsageOps = $false + if ($Operations -and $usageOps) { + $opsLower = @($Operations | ForEach-Object { $_.ToLowerInvariant() }) + $usageLower = @($usageOps | ForEach-Object { $_.ToLowerInvariant() }) + $hasUsageOps = ($opsLower | 
Where-Object { $usageLower -contains $_ }) | Select-Object -First 1 + } + if ($hasUsageOps) { + $RecordTypes = $null + $ServiceTypes = $null + } + } catch { } + + # Add operation filters if specified + if ($Operations -and $Operations.Count -gt 0) { + $body.operationFilters = @($Operations) + } + + # Add optional record/service filters (passthrough from caller) + if ($RecordTypes -and $RecordTypes.Count -gt 0) { + $body.recordTypeFilters = @($RecordTypes) + } + + if ($ServiceTypes -and $ServiceTypes.Count -gt 0) { + $body.serviceFilter = $ServiceTypes[0] + } + + # Log query details for troubleshooting (persisted to run log) + Write-LogHost "[INFO] Graph API Query Body:" -ForegroundColor Magenta + if ($Operations -and $Operations.Count -gt 0) { + Write-LogHost " operationFilters: $($Operations -join ', ')" -ForegroundColor DarkGray + } + if ($RecordTypes -and $RecordTypes.Count -gt 0) { + Write-LogHost " recordTypeFilters: $($RecordTypes -join ', ')" -ForegroundColor DarkGray + } + if ($ServiceTypes -and $ServiceTypes.Count -gt 0) { + Write-LogHost " serviceFilter: $($ServiceTypes[0])" -ForegroundColor DarkGray + } + $bodyJson = $body | ConvertTo-Json -Depth 10 + Write-LogHost $bodyJson -ForegroundColor DarkGray + + # Submit query via Graph API (auto-detects v1.0 or beta) + $uri = Get-GraphAuditApiUri -Path 'queries' + $response = Invoke-MgGraphRequest -Method POST -Uri $uri -Body $body -ErrorAction Stop + + if ($response -and $response.id) { + return $response.id + } + else { + Write-LogHost "WARNING: Graph API query submitted but no ID returned" -ForegroundColor Yellow + return $null + } + } + catch { + Write-LogHost "ERROR: Failed to submit Graph audit query: $($_.Exception.Message)" -ForegroundColor Red + try { + if ($_.Exception.Response) { + $respStream = $_.Exception.Response.GetResponseStream() + if ($respStream) { + $reader = New-Object System.IO.StreamReader($respStream) + $body = $reader.ReadToEnd() + $reader.Dispose() + if ($body) { Write-LogHost "GRAPH 
response body: $body" -ForegroundColor DarkGray } + } + } + } catch {} + return $null + } +} + +function Get-GraphAuditQueryStatus { + <# + .SYNOPSIS + Checks the status of a Graph API audit log query. + + .DESCRIPTION + Polls the Microsoft Graph Security API to check query execution status. + + Possible status values: + - notStarted: Query submitted but not yet processing + - queued: Query waiting in backend queue for available execution slot + - running: Query is executing + - succeeded: Query completed successfully, records ready + - failed: Query failed + - cancelled: Query was cancelled + + .PARAMETER QueryId + The query ID returned by Invoke-GraphAuditQuery + + .OUTPUTS + Hashtable with status information: @{ Status='succeeded'; RecordCount=1234 } + Returns $null if query check fails + #> + + param( + [Parameter(Mandatory = $true)] + [string]$QueryId + ) + + try { + $uri = Get-GraphAuditApiUri -Path "queries/$QueryId" + $response = Invoke-MgGraphRequest -Method GET -Uri $uri -ErrorAction Stop + + $result = @{ + QueryId = $QueryId + Status = $response.status + RecordCount = 0 + } + + # Some status responses include record count + if ($response.PSObject.Properties.Name -contains 'recordCount') { + $result.RecordCount = $response.recordCount + } + + return $result + } + catch { + Write-LogHost "ERROR: Failed to get Graph query status: $($_.Exception.Message)" -ForegroundColor Red + return $null + } +} + +function Get-GraphAuditRecords { + <# + .SYNOPSIS + Retrieves audit log records from a completed Graph API query. + + .DESCRIPTION + Fetches audit log records from Microsoft Graph Security API for a completed query. + Handles pagination automatically using @odata.nextLink. + + Only call this function after confirming query status is "succeeded". 
+ + .PARAMETER QueryId + The query ID returned by Invoke-GraphAuditQuery + + .PARAMETER MaxRecords + Maximum number of records to retrieve (default: unlimited) + + .OUTPUTS + Array of audit log record objects, or empty array if none found + #> + + param( + [Parameter(Mandatory = $true)] + [string]$QueryId, + + [Parameter(Mandatory = $false)] + [int]$MaxRecords = 0 + ) + + try { + $allRecords = @() + $uri = Get-GraphAuditApiUri -Path "queries/$QueryId/records" + + do { + $response = Invoke-MgGraphRequest -Method GET -Uri $uri -ErrorAction Stop + + if ($response -and $response.value) { + $allRecords += $response.value + + # Check if we've hit the max records limit + if ($MaxRecords -gt 0 -and $allRecords.Count -ge $MaxRecords) { + $allRecords = $allRecords | Select-Object -First $MaxRecords + break + } + } + + # Check for pagination + $uri = $response.'@odata.nextLink' + + } while ($uri) + + return $allRecords + } + catch { + Write-LogHost "ERROR: Failed to retrieve Graph audit records: $($_.Exception.Message)" -ForegroundColor Red + return @() + } +} + +# ============================================== +# DATA NORMALIZATION FUNCTION +# ============================================== +# Converts Graph API audit records to EOM-compatible schema + +function ConvertFrom-GraphAuditRecord { + <# + .SYNOPSIS + Normalizes Graph API audit records to match EOM cmdlet output schema. + + .DESCRIPTION + Transforms Microsoft Graph Security API audit log records into the same + structure as Search-UnifiedAuditLog cmdlet output. This ensures the + existing explosion logic works identically regardless of data source. 
+ + Graph API Schema → EOM Schema Mapping: + - auditLogRecordType → RecordType + - operation → Operations + - createdDateTime → CreationDate + - auditData → AuditData (JSON string) + - userPrincipalName → UserIds + - id → Identity (unique record identifier) + + .PARAMETER GraphRecords + Array of audit log records from Graph API (Get-GraphAuditRecords output) + + .OUTPUTS + Array of normalized records matching EOM schema structure + #> + + param( + [Parameter(Mandatory = $true)] + [AllowEmptyCollection()] + [array]$GraphRecords + ) + + if (-not $GraphRecords -or $GraphRecords.Count -eq 0) { + return @() + } + + $normalized = @() + + foreach ($record in $GraphRecords) { + try { + # Create EOM-compatible object structure + $eomRecord = [PSCustomObject]@{ + RecordType = $null + CreationDate = $null + UserIds = $null + Operations = $null + AuditData = '{}' + } + # Map: auditLogRecordType → RecordType + if ($record.PSObject.Properties.Name -contains 'auditLogRecordType') { + $eomRecord.RecordType = $record.auditLogRecordType + } + + # Map: createdDateTime → CreationDate + if ($record.PSObject.Properties.Name -contains 'createdDateTime') { + try { + $eomRecord.CreationDate = script:Parse-DateSafe $record.createdDateTime + } + catch { + $eomRecord.CreationDate = $record.createdDateTime + } + } + + # Map: userPrincipalName → UserIds + if ($record.PSObject.Properties.Name -contains 'userPrincipalName') { + $eomRecord.UserIds = $record.userPrincipalName + } + + # Map: operation → Operations + if ($record.PSObject.Properties.Name -contains 'operation') { + $eomRecord.Operations = $record.operation + } + + # Map: id → Identity (unique identifier) + if ($record.PSObject.Properties.Name -contains 'id') { + $eomRecord.Identity = $record.id + } + + # Map: auditData → AuditData (must be JSON string for explosion logic) + # PERF: Also store _ParsedAuditData to avoid re-parsing during explosion + if ($record.PSObject.Properties.Name -contains 'auditData') { + $auditDataObj = 
$record.auditData + + # Store the already-parsed object for explosion optimization + $eomRecord | Add-Member -NotePropertyName '_ParsedAuditData' -NotePropertyValue $auditDataObj -Force + + # If auditData is already an object, convert to JSON string + if ($auditDataObj -is [string]) { + $eomRecord.AuditData = $auditDataObj + # String means it wasn't pre-parsed, clear _ParsedAuditData + $eomRecord._ParsedAuditData = $null + } + else { + # Convert object to JSON string (explosion logic expects string) + try { + $eomRecord.AuditData = ($auditDataObj | ConvertTo-Json -Depth 100 -Compress) + } + catch { + Write-LogHost "WARNING: Failed to serialize auditData for record $($eomRecord.Identity)" -ForegroundColor Yellow + $eomRecord.AuditData = '{}' + $eomRecord._ParsedAuditData = $null + } + } + } + else { + # No auditData present - create minimal valid JSON + $eomRecord.AuditData = '{}' + } + + $normalized += $eomRecord + } + catch { + Write-LogHost "WARNING: Failed to normalize Graph record: $($_.Exception.Message)" -ForegroundColor Yellow + # Continue processing remaining records + } + } + + return $normalized +} + +# +# Core live-mode functions providing connectivity and paged audit retrieval. +# NOTE: This function is now wrapped by Connect-PurviewAudit for EOM mode compatibility + +function Connect-ToComplianceCenter { + param() + if ($script:Connected) { return } + Write-LogHost "Connecting to Microsoft 365 Security & Compliance Center..." -ForegroundColor Cyan + # Ensure ExchangeOnlineManagement module is available + try { + $existingEOM = Get-Module -ListAvailable -Name ExchangeOnlineManagement | Sort-Object Version -Descending | Select-Object -First 1 + if (-not $existingEOM) { + Write-LogHost "Installing ExchangeOnlineManagement module (CurrentUser scope)..." 
-ForegroundColor Yellow
            try { Install-Module -Name ExchangeOnlineManagement -Scope CurrentUser -Force -AllowClobber -ErrorAction Stop } catch { Write-LogHost "Failed to install module: $($_.Exception.Message)" -ForegroundColor Red; throw }
        }
        Import-Module ExchangeOnlineManagement -Force -ErrorAction Stop
    } catch {
        Write-LogHost "Module load/install failure: $($_.Exception.Message)" -ForegroundColor Red
        throw
    }

    # Authentication modes (subset retained for stability)
    try {
        # Probe once whether this Connect-ExchangeOnline build exposes -UseWebLogin, so the
        # explicit 'weblogin' mode and the default-mode fallback make the same decision.
        $exoCmd = Get-Command Connect-ExchangeOnline -ErrorAction SilentlyContinue
        $hasUseWeb = [bool]($exoCmd -and $exoCmd.Parameters.ContainsKey('UseWebLogin'))

        # "$Auth" (string interpolation) turns an unset/null $Auth into '' so it falls
        # through to the default branch instead of throwing on .ToLower().
        switch ("$Auth".ToLower()) {
            'weblogin' {
                try {
                    if ($hasUseWeb) {
                        Write-LogHost 'Using Connect-ExchangeOnline -UseWebLogin (parameter present).' -ForegroundColor DarkGray
                        Connect-ExchangeOnline -ShowBanner:$false -UseWebLogin -ErrorAction Stop | Out-Null
                    }
                    else {
                        Write-LogHost 'UseWebLogin parameter not available in this host/module; invoking standard interactive Connect-ExchangeOnline.' -ForegroundColor Yellow
                        Connect-ExchangeOnline -ShowBanner:$false -ErrorAction Stop | Out-Null
                    }
                }
                catch { Write-LogHost "WebLogin flow failed: $($_.Exception.Message)" -ForegroundColor Red; throw }
            }
            'devicecode' {
                # -ErrorAction Stop: a failed sign-in must reach the catch below rather than
                # fall through and mark the session as connected.
                Connect-ExchangeOnline -ShowBanner:$false -Device -ErrorAction Stop | Out-Null
            }
            'credential' {
                $cred = Get-Credential -Message 'Enter admin credentials for Exchange Online'
                Connect-ExchangeOnline -ShowBanner:$false -Credential $cred -ErrorAction Stop | Out-Null
            }
            default {
                # Silent first, fallback to WebLogin
                $silentOk = $true
                try { Connect-ExchangeOnline -ShowBanner:$false -ErrorAction Stop | Out-Null } catch { $silentOk = $false }
                if (-not $silentOk) {
                    if ($hasUseWeb) {
                        try { Connect-ExchangeOnline -ShowBanner:$false -UseWebLogin -ErrorAction Stop | Out-Null } catch { Write-LogHost "Silent + fallback auth failed: $($_.Exception.Message)" -ForegroundColor Red; throw }
                    }
                    else {
                        # -UseWebLogin is not offered by this module build; passing it would only
                        # fail with a parameter-binding error, so retry the standard interactive flow.
                        try { Connect-ExchangeOnline -ShowBanner:$false -ErrorAction Stop | Out-Null } catch { Write-LogHost "Silent + fallback auth failed: $($_.Exception.Message)" -ForegroundColor Red; throw }
                    }
                }
            }
        }
        $script:Connected = $true
        Write-LogHost "Connected successfully."
-ForegroundColor Green + } + catch { + Write-LogHost "Connection failure: $($_.Exception.Message)" -ForegroundColor Red + throw + } +} + +# ============================================== +# DUAL-MODE DIAGNOSTICS FUNCTION +# ============================================== +# Unified capability check for both EOM and Graph API modes + +function Test-PurviewAuditCapability { + <# + .SYNOPSIS + Tests audit log query capability for either EOM or Graph API mode. + + .DESCRIPTION + Performs connectivity and permission checks based on active mode. + + EOM Mode: + - Verifies Search-UnifiedAuditLog cmdlet availability + - Performs probe query to test permissions + - Checks for proper role assignments + + Graph API Mode: + - Tests Graph API connectivity + - Verifies AuditLog.Read.All permissions + - Performs lightweight endpoint check + + .PARAMETER UseEOMMode + If true, test EOM capabilities. If false, test Graph API. + + .PARAMETER SkipChecks + If true, skip all diagnostic checks (for advanced scenarios) + + .OUTPUTS + $true if capability check passes, $false otherwise + #> + + param( + [Parameter(Mandatory = $false)] + [bool]$UseEOMMode = $false, + + [Parameter(Mandatory = $false)] + [bool]$SkipChecks = $false + ) + + if ($SkipChecks) { + Write-LogHost "Diagnostics: Skipped (per user request)" -ForegroundColor Gray + return $true + } + + if ($UseEOMMode) { + # ======================================== + # EOM MODE DIAGNOSTICS + # ======================================== + + Write-LogHost "Running EOM capability diagnostics..." -ForegroundColor Cyan + + # Check if cmdlet is available + $cmd = Get-Command Search-UnifiedAuditLog -ErrorAction SilentlyContinue + if (-not $cmd) { + Write-LogHost " ✗ DIAGNOSTIC FAILED: 'Search-UnifiedAuditLog' cmdlet not found" -ForegroundColor Red + Write-LogHost "" + Write-LogHost "Troubleshooting:" -ForegroundColor Yellow + Write-LogHost " 1. Ensure ExchangeOnlineManagement module v3+ is installed" -ForegroundColor White + Write-LogHost " 2. 
Try: Install-Module ExchangeOnlineManagement -Scope CurrentUser" -ForegroundColor White + Write-LogHost " 3. Verify authentication completed successfully" -ForegroundColor White + Write-LogHost "" + Write-LogHost "Role Requirements:" -ForegroundColor Yellow + Write-LogHost " • View-Only Audit Logs role (minimum)" -ForegroundColor White + Write-LogHost " • Compliance Management role group" -ForegroundColor White + Write-LogHost " • Organization Management role group" -ForegroundColor White + return $false + } + + # Perform probe query to test permissions + try { + $now = (Get-Date).ToUniversalTime() + $probeStart = $now.AddMinutes(-7) + $probeEnd = $now.AddMinutes(-6) + + # Lightweight probe with unlikely operation + $null = Search-UnifiedAuditLog -StartDate $probeStart -EndDate $probeEnd -Operations 'UserLoggedIn' -ResultSize 1 -ErrorAction Stop + + Write-LogHost " EOM capability check passed" -ForegroundColor Green + return $true + } + catch { + $msg = $_.Exception.Message + Write-LogHost " ✗ DIAGNOSTIC FAILED: Probe query failed" -ForegroundColor Red + Write-LogHost " Error: $msg" -ForegroundColor Yellow + Write-LogHost "" + + if ($msg -match 'is not within the current user|Access denied|not authorized|insufficient') { + Write-LogHost "Likely Cause: Missing required roles" -ForegroundColor Yellow + Write-LogHost " Add account to 'Audit Logs' role group in Microsoft Purview" -ForegroundColor White + } + elseif ($msg -match 'The term .*Search-UnifiedAuditLog.* is not recognized') { + Write-LogHost "Likely Cause: Module not loaded properly" -ForegroundColor Yellow + Write-LogHost " Try: Import-Module ExchangeOnlineManagement -Force" -ForegroundColor White + } + else { + Write-LogHost "General Guidance:" -ForegroundColor Yellow + Write-LogHost " 1. Ensure Unified Audit Log is enabled tenant-wide" -ForegroundColor White + Write-LogHost " 2. Verify role assignments are properly configured" -ForegroundColor White + Write-LogHost " 3. 
Check for conditional access policies blocking access" -ForegroundColor White + } + + return $false + } + } + else { + # ======================================== + # GRAPH API MODE DIAGNOSTICS + # ======================================== + + Write-LogHost "Running Graph API capability diagnostics..." -ForegroundColor Cyan + + # Verify connected to Graph + try { + $context = Get-MgContext -ErrorAction Stop + + if (-not $context) { + Write-LogHost " ✗ DIAGNOSTIC FAILED: Not connected to Microsoft Graph" -ForegroundColor Red + Write-LogHost " Run Connect-PurviewAudit first to establish connection" -ForegroundColor Yellow + return $false + } + + # Check for required scopes + $requiredScope = 'AuditLog.Read.All' + if ($context.Scopes -notcontains $requiredScope) { + Write-LogHost " [!] WARNING: Missing required scope: $requiredScope" -ForegroundColor Yellow + Write-LogHost " Queries may fail without this permission" -ForegroundColor Yellow + } + } + catch { + Write-LogHost " ✗ DIAGNOSTIC FAILED: Unable to get Graph context" -ForegroundColor Red + Write-LogHost " Error: $($_.Exception.Message)" -ForegroundColor Yellow + return $false + } + + # Test Graph API endpoint connectivity + try { + # Test actual query endpoint with minimal test query + $testQueryBody = @{ + displayName = "PAX-Diagnostic-Test-$(Get-Date -Format 'HHmmss')" + filterStartDateTime = (Get-Date).AddMinutes(-1).ToUniversalTime().ToString('yyyy-MM-ddTHH:mm:ss.fffZ') + filterEndDateTime = (Get-Date).ToUniversalTime().ToString('yyyy-MM-ddTHH:mm:ss.fffZ') + operationFilters = @('UserLoggedIn') # Common activity type for quick test + } + + $createUri = Get-GraphAuditApiUri -Path 'queries' + $createResponse = Invoke-MgGraphRequest -Method POST -Uri $createUri -Body $testQueryBody -ErrorAction Stop if ($createResponse.id) { + Write-LogHost " Graph API capability check passed" -ForegroundColor Green + Write-LogHost " Successfully created test query (ID: $($createResponse.id))" -ForegroundColor Green + return 
$true + } + else { + Write-LogHost " ✗ DIAGNOSTIC WARNING: Query created but no ID returned" -ForegroundColor Yellow + return $false + } + } + catch { + $msg = $_.Exception.Message + + # Check if this is a throttling error (429 TooManyRequests) + $isThrottling = $msg -match 'TooManyRequests|429|Too many requests|throttl' + + if ($isThrottling) { + # Set flag so we don't show scary warning message later + $script:ThrottlingDetected = $true + + # Throttling detected - friendly terminal message, full details to log only + Write-Host "" + Write-Host "============================================================================================================" -ForegroundColor DarkYellow + Write-Host " [!] Graph API Throttling Detected (429 - Too Many Requests)" -ForegroundColor DarkYellow + Write-Host "============================================================================================================" -ForegroundColor DarkYellow + Write-Host "" + Write-Host " Microsoft Graph is currently rate-limiting requests to your tenant." -ForegroundColor White + Write-Host "" + Write-Host " How PAX handles throttling:" -ForegroundColor Cyan + Write-Host " • Automatic exponential backoff with retries" -ForegroundColor Gray + Write-Host " • Circuit breaker protection (pauses after repeated failures)" -ForegroundColor Gray + Write-Host " • Adaptive concurrency (reduces parallel requests)" -ForegroundColor Gray + Write-Host " • Real-time notifications (you'll see throttle events as they occur)" -ForegroundColor Gray + Write-Host "" + Write-Host " Recommendation: Graph API throttling typically clears within 5-10 minutes." 
-ForegroundColor Yellow + Write-Host "" # Interactive prompt (unless -Force is used for headless runs) + if (-not $Force) { + Write-Host " Options:" -ForegroundColor Cyan + Write-Host " [C] CONTINUE - Proceed with automatic throttling handling (may be slow)" -ForegroundColor Green + Write-Host " [E] EXIT - Stop gracefully and retry later (recommended if heavily throttled)" -ForegroundColor Red + Write-Host "" + + Send-PromptNotification + $choice = Read-Host " Enter your choice [C/E]" if ($choice -match '^E$|^Exit$') { + Write-Host "" + Write-Host " Exiting gracefully..." -ForegroundColor Yellow + Write-Host " Disconnecting from Microsoft Graph..." -ForegroundColor Gray + + try { + Disconnect-MgGraph -ErrorAction SilentlyContinue | Out-Null + Write-Host " Disconnected successfully" -ForegroundColor Green + } + catch { + Write-Host " (Graph connection cleanup completed)" -ForegroundColor Gray + } + + Write-Host "" + Write-Host " Please wait 5-10 minutes before retrying." -ForegroundColor Cyan + Write-Host "" + + # Log the graceful exit + Write-Output "[DIAGNOSTIC] User chose to exit due to throttling. Will retry later." | Out-File -FilePath $LogFile -Append -Encoding utf8 + + exit 0 + } + else { + Write-Host "" + Write-Host " Proceeding with automatic throttling handling..." -ForegroundColor Green + Write-Host " Expect slower execution times while Graph API recovers." -ForegroundColor Gray + Write-Host "" + } + } + else { + # -Force flag present (headless/automation mode) - proceed automatically + Write-Host " -Force flag detected: Proceeding automatically with throttling handling..." -ForegroundColor Green + Write-Host " Expect slower execution times while Graph API recovers." 
-ForegroundColor Gray + Write-Host "" + } + + # Log full error details to log file only (not terminal) + Write-Output "[DIAGNOSTIC] Graph API throttling detected during capability check" | Out-File -FilePath $LogFile -Append -Encoding utf8 + Write-Output "[DIAGNOSTIC] Full error details: $msg" | Out-File -FilePath $LogFile -Append -Encoding utf8 + Write-Output "[DIAGNOSTIC] Continuing with automatic throttling handling enabled" | Out-File -FilePath $LogFile -Append -Encoding utf8 + } + else { + # Non-throttling error - show full details + Write-LogHost " ✗ DIAGNOSTIC FAILED: Graph API endpoint test failed" -ForegroundColor Red + Write-LogHost " Error: $msg" -ForegroundColor Yellow + Write-LogHost "" + + if ($msg -match 'Forbidden|403|Access.*denied|Insufficient privileges') { + Write-LogHost "Likely Cause: Missing required permissions" -ForegroundColor Yellow + Write-LogHost " Required: AuditLog.Read.All Graph API scope" -ForegroundColor White + Write-LogHost " Required: Azure AD role (Compliance/Security Administrator)" -ForegroundColor White + } + elseif ($msg -match 'Unauthorized|401') { + Write-LogHost "Likely Cause: Authentication issue" -ForegroundColor Yellow + Write-LogHost " Try disconnecting and reconnecting: Disconnect-MgGraph; Connect-PurviewAudit" -ForegroundColor White + } + else { + Write-LogHost "General Guidance:" -ForegroundColor Yellow + Write-LogHost " 1. Verify admin has consented to AuditLog.Read.All scope" -ForegroundColor White + Write-LogHost " 2. Check Azure AD role assignments" -ForegroundColor White + Write-LogHost " 3. 
Ensure network connectivity to graph.microsoft.com" -ForegroundColor White + } + } + + return $false + } + } +} + +# ============================================== +# DUAL-MODE GROUP EXPANSION FUNCTION +# ============================================== +# Expand distribution/security groups to individual user principal names + +function Expand-GroupToUsers { + <# + .SYNOPSIS + Expands a distribution or security group to individual user principal names. + + .DESCRIPTION + Retrieves members of a group using either EOM cmdlets or Graph API. + + EOM Mode: + - Uses Get-DistributionGroupMember cmdlet + - Accepts group display name or email address + - Returns PrimarySmtpAddress of members + + Graph API Mode: + - Uses Get-MgGroupMember cmdlet + - Requires group ObjectId (auto-resolved from display name) + - Returns userPrincipalName of user members + + .PARAMETER GroupIdentity + The group identifier. Can be: + - Display name (e.g., "Executive Leadership") + - Email address (e.g., "exec-team@contoso.com") + - ObjectId/GUID (Graph mode only) + + .PARAMETER UseEOMMode + If true, use EOM cmdlets. If false, use Graph API. 
+ + .OUTPUTS + Array of user principal names (email addresses) + #> + + param( + [Parameter(Mandatory = $true)] + [string]$GroupIdentity, + + [Parameter(Mandatory = $false)] + [bool]$UseEOMMode = $false + ) + + $members = @() + + if ($UseEOMMode) { + # ======================================== + # EOM MODE: Get-DistributionGroupMember + # ======================================== + + try { + Write-LogHost " Processing group (EOM): '$GroupIdentity'" -ForegroundColor Gray + + # Get-DistributionGroupMember works with display name or email + $groupMembers = Get-DistributionGroupMember -Identity $GroupIdentity -ErrorAction Stop + + $members = $groupMembers | Select-Object -ExpandProperty PrimarySmtpAddress + + Write-LogHost " Expanded: $($members.Count) member(s)" -ForegroundColor DarkGray + } + catch { + Write-LogHost " Warning: Failed to expand group '$GroupIdentity': $($_.Exception.Message)" -ForegroundColor Yellow + Write-LogHost " Possible causes:" -ForegroundColor Yellow + Write-LogHost " • Group does not exist or name is misspelled" -ForegroundColor Gray + Write-LogHost " • Insufficient permissions (need Organization Management or similar)" -ForegroundColor Gray + Write-LogHost " • Group is not a distribution/mail-enabled group" -ForegroundColor Gray + } + } + else { + # ======================================== + # GRAPH API MODE: Get-MgGroupMember + # ======================================== + + try { + Write-LogHost " Processing group (Graph API): '$GroupIdentity'" -ForegroundColor Gray + + # Determine if we have an ObjectId (GUID) or display name + $groupId = $null + if ($GroupIdentity -match '^[0-9a-fA-F]{8}-([0-9a-fA-F]{4}-){3}[0-9a-fA-F]{12}$') { + # Looks like a GUID, use directly + $groupId = $GroupIdentity + } + else { + # Display name or email - need to resolve to ObjectId + Write-LogHost " Resolving group ID from display name..." 
-ForegroundColor DarkGray + + # Try searching by display name first + $groupSearch = Get-MgGroup -Filter "displayName eq '$GroupIdentity'" -ErrorAction SilentlyContinue + + if (-not $groupSearch) { + # Try by mail/mailNickname + $groupSearch = Get-MgGroup -Filter "mail eq '$GroupIdentity'" -ErrorAction SilentlyContinue + } + + if ($groupSearch) { + $groupId = $groupSearch.Id + Write-LogHost " Resolved to ObjectId: $groupId" -ForegroundColor DarkGray + } + else { + throw "Unable to find group with identifier: $GroupIdentity" + } + } + + # Get group members (users only) + $groupMembers = Get-MgGroupMember -GroupId $groupId -All -ErrorAction Stop + + # Filter to users only and extract UPN + foreach ($member in $groupMembers) { + # Check if member is a user (not a nested group or service principal) + if ($member.AdditionalProperties.'@odata.type' -eq '#microsoft.graph.user') { + # Get full user object to retrieve userPrincipalName + $user = Get-MgUser -UserId $member.Id -ErrorAction SilentlyContinue + if ($user -and $user.UserPrincipalName) { + $members += $user.UserPrincipalName + } + } + } + + Write-LogHost " Expanded: $($members.Count) user member(s)" -ForegroundColor DarkGray + } + catch { + Write-LogHost " Warning: Failed to expand group '$GroupIdentity': $($_.Exception.Message)" -ForegroundColor Yellow + Write-LogHost " Possible causes:" -ForegroundColor Yellow + Write-LogHost " • Group does not exist or identifier is invalid" -ForegroundColor Gray + Write-LogHost " • Insufficient permissions (need Group.Read.All or Directory.Read.All)" -ForegroundColor Gray + Write-LogHost " • Network connectivity issues with Graph API" -ForegroundColor Gray + } + } + + return $members +} + +# ============================================== +# DUAL-MODE QUERY EXECUTION WRAPPER +# ============================================== +# Unified query function that routes to either EOM or Graph API + +function Invoke-PurviewAuditQuery { + <# + .SYNOPSIS + Executes an audit log query using 
either EOM or Graph API. + + .DESCRIPTION + Routes audit log queries to the appropriate backend: + - EOM Mode: Uses Search-UnifiedAuditLog cmdlet + - Graph API Mode: Uses async query pattern (create → poll → retrieve) + + Returns audit records in a normalized format compatible with downstream processing. + + .PARAMETER StartDate + Query start date (inclusive) + + .PARAMETER EndDate + Query end date (exclusive) + + .PARAMETER Operations + Activity type(s) to query + + .PARAMETER UserIds + Optional array of user principal names to filter by + + .PARAMETER ResultSize + Maximum number of records to retrieve (EOM mode) + + .PARAMETER UseEOMMode + If true, use EOM cmdlets. If false, use Graph API. + + .OUTPUTS + Array of audit log records in normalized schema + #> + + param( + [Parameter(Mandatory = $true)] + [datetime]$StartDate, + + [Parameter(Mandatory = $true)] + [datetime]$EndDate, + + [Parameter(Mandatory = $true)] + [string]$Operations, + + [Parameter(Mandatory = $false)] + [string[]]$UserIds, + + [Parameter(Mandatory = $false)] + [int]$ResultSize = 5000, + + [Parameter(Mandatory = $false)] + [bool]$UseEOMMode = $false + ) + + if ($UseEOMMode) { + # ======================================== + # EOM MODE: Use existing Search-UnifiedAuditLog logic + # ======================================== + + # Call existing retry wrapper (preserves all the sophisticated logic) + $results = Invoke-SearchUnifiedAuditLogWithRetry ` + -Start $StartDate ` + -End $EndDate ` + -Operation $Operations ` + -ResultSize $ResultSize ` + -UserIds $UserIds ` + -AutoSubdivide $true + + return $results + } + else { + # ======================================== + # GRAPH API MODE: Async query pattern + # ======================================== + + try { + # Step 1: Create async query + Write-Host " [Graph API] Creating async query for $Operations..." 
-ForegroundColor DarkGray + + # Use last included minute (EndDate - 1 minute) since end date is exclusive + $endDisplay = $EndDate.AddMinutes(-1) + $displayName = "PAX_Query_$($StartDate.ToString('yyyyMMdd_HHmm'))-$($endDisplay.ToString('yyyyMMdd_HHmm'))" + + # M365 usage mode requires operationFilters only (no recordType/service filters) + $recordTypesArg = $null + $serviceFilterArg = $null + if (-not $script:IncludeM365Usage) { + # Only populate filters when NOT in M365 usage mode + $recordTypesArg = $RecordTypes + $serviceFilterArg = $script:CurrentServiceFilter + if (-not $serviceFilterArg -and $ServiceTypes -and $ServiceTypes.Count -gt 0) { + $serviceFilterArg = $ServiceTypes[0] + } + } + + $queryId = Invoke-GraphAuditQuery ` + -DisplayName $displayName ` + -FilterStartDateTime $StartDate ` + -FilterEndDateTime $EndDate ` + -OperationFilters @($Operations) ` + -RecordTypeFilters $recordTypesArg ` + -ServiceFilter $serviceFilterArg + + if (-not $queryId) { + Write-Host " [Graph API] Failed to create query" -ForegroundColor Red + return @() + } + + Write-Host " [Graph API] Query created: $queryId" -ForegroundColor DarkGray + + # Step 2: Poll for completion + # Replaced fixed-count polling with time-budget model supporting extended outages (up to 30 minutes) + $effectiveOutageMinutes = if ($MaxNetworkOutageMinutes -and $MaxNetworkOutageMinutes -gt 0) { $MaxNetworkOutageMinutes } else { 30 } + $maxPollDurationSeconds = $effectiveOutageMinutes * 60 # Absolute cap for network outage tolerance + $pollInterval = 5 # Base interval (seconds) when healthy + $maxHealthyInterval = 15 # Cap interval when status retrieval succeeds + $pollStart = Get-Date + $pollCount = 0 + $queryComplete = $false + $networkErrorStreak = 0 + $networkOutageStart = $null + $lastNetMessage = $null # Throttle repetitive network messages + + Write-Host " [Graph API] Polling for query completion..." 
-ForegroundColor DarkGray + + # Transient network resilience variables + $transientPatterns = @('timed out','unable to connect','connection','remote name could not be resolved','temporarily unavailable') + + while (-not $queryComplete) { + # Network outage guard: only abort if we've been in a CONTINUOUS network outage + # exceeding the user's MaxNetworkOutageMinutes threshold. Normal polling + # continues indefinitely until the query succeeds, fails, or the user cancels. + if ($networkOutageStart) { + $outageElapsed = (Get-Date) - $networkOutageStart + if ($outageElapsed.TotalSeconds -ge $maxPollDurationSeconds) { + Write-Host " [NET] Polling aborted after $effectiveOutageMinutes minutes of continuous network outage" -ForegroundColor Red + break + } + } + Start-Sleep -Seconds $pollInterval + $pollCount++ + + $status = $null + try { + $status = Get-GraphAuditQueryStatus -QueryId $queryId -ErrorAction Stop + # Successful status retrieval resets outage tracking + $networkErrorStreak = 0 + if ($networkOutageStart) { + $outageDuration = (Get-Date) - $networkOutageStart + # Only log recovery if outage lasted > 1 minute (ignore brief connection blips) + if ($outageDuration.TotalMinutes -ge 1) { + Write-Host " [NET] Network recovered after $([Math]::Round($outageDuration.TotalMinutes,1)) minutes" -ForegroundColor Green + } + $networkOutageStart = $null + $lastNetMessage = $null + } + # Query status retrieved successfully; interval adjusted + # Tighten interval gradually back to healthy baseline + $pollInterval = [Math]::Max(5, [Math]::Min($pollInterval - 2, $maxHealthyInterval)) + } + catch { + $errMsg = $_.Exception.Message + if ($transientPatterns | Where-Object { $errMsg.ToLower().Contains($_) }) { + $networkErrorStreak++ + if (-not $networkOutageStart) { $networkOutageStart = Get-Date } + $outageElapsed = (Get-Date) - $networkOutageStart + # Throttle messages: only show if outage > 1 min OR first error with no recent message + if ($outageElapsed.TotalMinutes -ge 1 -or 
($networkErrorStreak -eq 1 -and (-not $lastNetMessage -or ((Get-Date) - $lastNetMessage).TotalSeconds -ge 60))) { + if (-not $lastNetMessage -or ((Get-Date) - $lastNetMessage).TotalSeconds -ge 60) { + Write-Host " [NET] Poll $pollCount`: transient network issue (streak $networkErrorStreak, outage $([Math]::Round($outageElapsed.TotalMinutes,1))m)" -ForegroundColor Yellow + $lastNetMessage = Get-Date + } + } + # Dynamic backoff growth with ceiling (to avoid hammering during outage) + $pollInterval = [Math]::Min(90, [Math]::Round($pollInterval * 1.6 + (Get-Random -Minimum 2 -Maximum 6))) + continue + } else { + Write-Host " [Graph API] Non-transient status error: $errMsg" -ForegroundColor Red + break + } + } + + if (-not $status) { continue } + + Write-Host " [Graph API] Poll $pollCount`: Status=$($status.Status), RecordCount=$($status.RecordCount)" -ForegroundColor DarkGray + + switch ($status.Status) { + 'succeeded' { + $queryComplete = $true + Write-Host " [Graph API] Query completed: $($status.RecordCount) records available" -ForegroundColor Green + break + } + 'failed' { + Write-Host " [Graph API] Query failed" -ForegroundColor Red + return @() + } + 'cancelled' { + Write-Host " [Graph API] Query was cancelled" -ForegroundColor Yellow + return @() + } + default { continue } + } + } + + if (-not $queryComplete) { + Write-Host " [Graph API] Query polling aborted (network outage or non-transient error after $pollCount polls)" -ForegroundColor Yellow + return @() + } + + # Step 3: Retrieve records + Write-Host " [Graph API] Retrieving records..." 
-ForegroundColor DarkGray + + # Retrieve records with transient retry resilience + $graphRecords = $null + $recordStart = Get-Date + $recordAttempt = 0 + $maxRecordDurationSeconds = $effectiveOutageMinutes * 60 + $retrieveInterval = 4 + while (-not $graphRecords) { + $recordAttempt++ + if (((Get-Date) - $recordStart).TotalSeconds -ge $maxRecordDurationSeconds) { + Write-Host " [NET] Record retrieval aborted after $effectiveOutageMinutes minutes of network instability" -ForegroundColor Red + break + } + try { + # Graph API: MaxRecords=0 (unlimited) - 10K limit only applies to EOM mode + $graphRecords = Get-GraphAuditRecords -QueryId $queryId -MaxRecords 0 -ErrorAction Stop + } + catch { + $err = $_.Exception.Message + if ($transientPatterns | Where-Object { $err.ToLower().Contains($_) }) { + Write-Host " [NET] Transient record fetch issue (attempt $recordAttempt, elapsed $([Math]::Round(((Get-Date)-$recordStart).TotalMinutes,2))m): $err" -ForegroundColor Yellow + $retrieveInterval = [Math]::Min(90, [Math]::Round($retrieveInterval * 1.5 + (Get-Random -Minimum 1 -Maximum 5))) + Start-Sleep -Seconds $retrieveInterval + continue + } else { + Write-Host " [Graph API] Non-transient record fetch error: $err" -ForegroundColor Red + break + } + } + } + if (-not $graphRecords) { Write-Host " [NET] Retrieval failed after extended retry window" -ForegroundColor Red } + + if (-not $graphRecords -or $graphRecords.Count -eq 0) { + Write-Host " [Graph API] No records returned" -ForegroundColor Gray + return @() + } + + Write-Host " [Graph API] Retrieved $($graphRecords.Count) records, normalizing..." 
-ForegroundColor DarkGray + + # Step 4: Normalize to EOM-compatible schema + $normalized = @() + foreach ($record in $graphRecords) { + $normalizedRecord = ConvertFrom-GraphAuditRecord -GraphRecord $record + if ($normalizedRecord) { + $normalized += $normalizedRecord + } + } + + # Filter by UserIds if specified (Graph API doesn't support UPN filtering in query) + if ($UserIds -and $UserIds.Count -gt 0 -and $normalized.Count -gt 0) { + Write-Host " [Graph API] Applying client-side UserIds filter..." -ForegroundColor DarkGray + $beforeFilter = $normalized.Count + $normalized = $normalized | Where-Object { $UserIds -contains $_.UserIds } + Write-Host " [Graph API] Filtered: $beforeFilter → $($normalized.Count) records" -ForegroundColor DarkGray + } + + Write-Host " [Graph API] Normalization complete: $($normalized.Count) records ready" -ForegroundColor Green + + return $normalized + } + catch { + Write-Host " [Graph API] Query error: $($_.Exception.Message)" -ForegroundColor Red + Write-Host " [Graph API] Falling back to empty result set" -ForegroundColor Yellow + return @() + } + } +} + +# ============================================== +# DUAL-MODE DISCONNECTION FUNCTION +# ============================================== +# Unified disconnection for both EOM and Graph API modes + +function Disconnect-PurviewAudit { + <# + .SYNOPSIS + Disconnects from either Exchange Online or Microsoft Graph. + + .DESCRIPTION + Cleanly disconnects active sessions based on mode. + + EOM Mode: + - Calls Disconnect-ExchangeOnline + - No confirmation prompt + + Graph API Mode: + - Calls Disconnect-MgGraph + - Clears Graph context + + .PARAMETER UseEOMMode + If true, disconnect from EOM. If false, disconnect from Graph. 
+ + .OUTPUTS + None + #> + + param( + [Parameter(Mandatory = $false)] + [bool]$UseEOMMode = $false + ) + + if ($UseEOMMode) { + # ======================================== + # EOM MODE: Disconnect-ExchangeOnline + # ======================================== + + try { + Write-LogHost "Disconnecting from Exchange Online..." -ForegroundColor Gray + Disconnect-ExchangeOnline -Confirm:$false -ErrorAction Stop | Out-Null + Write-LogHost " Disconnected from Exchange Online" -ForegroundColor Green + } + catch { + # Silently handle - may not be connected or already disconnected + Write-LogHost " (Exchange Online disconnection skipped or already disconnected)" -ForegroundColor DarkGray + } + } + else { + # ======================================== + # GRAPH API MODE: Disconnect-MgGraph + # ======================================== + + try { + # Check if connected first + $context = Get-MgContext -ErrorAction SilentlyContinue + + if ($context) { + Write-LogHost "Disconnecting from Microsoft Graph..." -ForegroundColor Gray + Disconnect-MgGraph -ErrorAction Stop | Out-Null + Write-LogHost " Disconnected from Microsoft Graph" -ForegroundColor Green + } + else { + Write-LogHost " (Not connected to Microsoft Graph)" -ForegroundColor DarkGray + } + } + catch { + # Silently handle - may not be connected or already disconnected + Write-LogHost " (Microsoft Graph disconnection skipped or already disconnected)" -ForegroundColor DarkGray + } + } +} + +# Pre-query diagnostic: verify Search-UnifiedAuditLog availability & likely permission coverage. +# NOTE: This function is now wrapped by Test-PurviewAuditCapability for EOM mode compatibility +function Invoke-AuditCapabilityDiagnostics { + param() + if ($SkipDiagnostics) { return $true } + $cmd = Get-Command Search-UnifiedAuditLog -ErrorAction SilentlyContinue + if (-not $cmd) { + Write-LogHost "DIAGNOSTIC: 'Search-UnifiedAuditLog' cmdlet not found in this session." 
-ForegroundColor Red + Write-LogHost "Guidance: Ensure ExchangeOnlineManagement module (v3+) is installed and imported. Try: Install-Module ExchangeOnlineManagement -Scope CurrentUser" -ForegroundColor Yellow + Write-LogHost "Role Requirements: Membership in 'Audit Logs' (preferred) or 'View-Only Audit Logs' / appropriate Compliance role group." -ForegroundColor Yellow + return $false + } + # Attempt a minimal, very narrow harmless probe query (empty expected results) + try { + $now = (Get-Date).ToUniversalTime() + $probeStart = $now.AddMinutes(-7) + $probeEnd = $now.AddMinutes(-6) + # Use an operation that is unlikely to appear but valid syntactically + $null = Search-UnifiedAuditLog -StartDate $probeStart -EndDate $probeEnd -Operations 'UserLoggedIn' -ResultSize 1 -ErrorAction Stop + Write-LogHost "Diagnostics: Audit search cmdlet available (probe succeeded/no error)." -ForegroundColor DarkGray + return $true + } + catch { + $msg = $_.Exception.Message + Write-LogHost "DIAGNOSTIC: Probe audit search failed: $msg" -ForegroundColor Yellow + if ($msg -match 'is not within the current user' -or $msg -match 'Access denied' -or $msg -match 'not authorized' -or $msg -match 'insufficient') { + Write-LogHost "Likely Missing Roles: Add the account to 'Audit Logs' (Microsoft Purview) or at minimum 'View-Only Audit Logs'." -ForegroundColor Red + } + elseif ($msg -match 'The term .*Search-UnifiedAuditLog.* is not recognized') { + Write-LogHost "Module Issue: Cmdlet not loaded. Import-Module ExchangeOnlineManagement or update module version." -ForegroundColor Red + } + else { + Write-LogHost "General Guidance: Ensure Unified Audit Log is enabled tenant-wide & correct role assignments are in place." -ForegroundColor Yellow + } + return $false + } +} + +function Invoke-SearchUnifiedAuditLogWithRetry { + <# + Provides pagination & early 10K detection. + Adjustments: + * Honors $PacingMs but leaves adaptive / circuit breaker to caller. 
+ * Maintains metrics.PagesFetched & global limit flags used by higher layers. + #> + param( + [Parameter(Mandatory)][datetime]$Start, + [Parameter(Mandatory)][datetime]$End, + [Parameter(Mandatory)][string]$Operation, + [Parameter(Mandatory)][int]$ResultSize, + [string[]]$UserIds, + [int]$MaxRetries = 3, + [bool]$AutoSubdivide = $true + ) + + $script:Hit10KLimit = $false + $script:LimitTimeWindow = "" + $allResults = New-Object System.Collections.ArrayList + $totalFetched = 0 + $pageNumber = 1 + $maxPages = 50 + $pageSize = [Math]::Min($ResultSize, 5000) + $useSessionPagination = $ResultSize -gt 5000 + $sessionId = if ($useSessionPagination) { [guid]::NewGuid().ToString() } else { $null } + + Write-LogHost (" Using {0} pagination (page size {1})" -f ($(if ($useSessionPagination){'session'} else {'standard'}), $pageSize)) -ForegroundColor DarkCyan + + try { + while ($totalFetched -lt $ResultSize -and $pageNumber -le $maxPages) { + $remainingNeeded = $ResultSize - $totalFetched + $currentPageSize = [Math]::Min($pageSize, $remainingNeeded) + $attempt = 0; $pageResults = $null + while ($attempt -le $MaxRetries) { + try { + $params = @{ StartDate = $Start; EndDate = $End; Operations = $Operation; ResultSize = $currentPageSize; ErrorAction = 'Stop' } + if ($UserIds) { $params['UserIds'] = $UserIds } + if ($useSessionPagination) { + $params['SessionId'] = $sessionId + $params['SessionCommand'] = if ($pageNumber -eq 1) { 'ReturnLargeSet' } else { 'ReturnNextPreviewPage' } + } + if ($PacingMs -gt 0) { Start-Sleep -Milliseconds $PacingMs } + if ($attempt -gt 0) { Write-LogHost " Retrying page $pageNumber (attempt $($attempt+1))" -ForegroundColor Yellow } + $pageResults = Search-UnifiedAuditLog @params + break + } + catch { + $attempt++ + if ($attempt -le $MaxRetries) { + $delay = [Math]::Min(30, [Math]::Pow(2, $attempt)) + Write-LogHost " Page $pageNumber failed: $($_.Exception.Message). 
Backoff ${delay}s" -ForegroundColor DarkYellow + Start-Sleep -Seconds $delay + if ($useSessionPagination -and $attempt -gt 1) { $sessionId = [guid]::NewGuid().ToString(); Write-LogHost " New session id for retry: $sessionId" -ForegroundColor DarkGray } + } else { + Write-LogHost " Page $pageNumber permanently failed after $attempt attempts" -ForegroundColor Red + throw + } + } + } + + if ($pageResults -and $pageResults.Count -gt 0) { + # Early 10K detection (first page result count meta) + if ($pageNumber -eq 1 -and $AutoSubdivide) { + try { + $est = $pageResults[0].ResultCount + if ($null -ne $est -and $est -ge 10000) { + Write-LogHost " [!] Estimated >=10K records in window – consider subdivision" -ForegroundColor Yellow + } + } catch {} + } + # Safe add - handle both array and single object returns + if ($pageResults -is [Array]) { + foreach ($item in $pageResults) { [void]$allResults.Add($item) } + } else { + [void]$allResults.Add($pageResults) + } + $totalFetched += $pageResults.Count + # Hard stop enforcement: never return more than requested -ResultSize + if ($totalFetched -ge $ResultSize) { + if ($totalFetched -gt $ResultSize) { + $excess = $totalFetched - $ResultSize + # Trim excess items from tail + for ($trim = 0; $trim -lt $excess; $trim++) { [void]$allResults.RemoveAt($allResults.Count - 1) } + $totalFetched = $ResultSize + } + Write-LogHost " Requested result size $ResultSize reached (cumulative: $totalFetched) – stopping" -ForegroundColor DarkCyan + try { $script:metrics.PagesFetched += 1 } catch {} + break + } + try { $script:metrics.PagesFetched += 1 } catch {} + Write-LogHost " Page $pageNumber returned $($pageResults.Count) (cumulative: $totalFetched)" -ForegroundColor DarkCyan + if ($pageResults.Count -lt $currentPageSize) { break } + if ($totalFetched -ge 10000) { + $script:Hit10KLimit = $true + $script:LimitTimeWindow = "$(($Start).ToString('yyyy-MM-dd HH:mm')) to $(($End).ToString('yyyy-MM-dd HH:mm'))" + # SMART SUBDIVISION for EOM: Analyze 
timestamp distribution + if ($AutoSubdivide -and $allResults.Count -ge 10000) { + try { + $timestamps = @() + foreach ($rec in $allResults) { + if ($rec.CreationDate) { + $ts = script:Parse-DateSafe $rec.CreationDate; if ($ts) { $timestamps += $ts } + } + } + if ($timestamps.Count -gt 100) { + $sorted = $timestamps | Sort-Object + $coveredHours = ($sorted[-1] - $sorted[0]).TotalHours + $totalHours = ($End - $Start).TotalHours + if ($coveredHours -gt 0 -and $coveredHours -lt $totalHours) { + $recordsPerHour = 10000 / $coveredHours + $targetHours = 8000 / $recordsPerHour + $subdivFactor = [Math]::Max(2, [Math]::Ceiling($totalHours / $targetHours)) + Write-LogHost " [SMART SUBDIVISION] EOM: $([Math]::Round($coveredHours,2))h of $([Math]::Round($totalHours,2))h → suggest dividing by $subdivFactor" -ForegroundColor Cyan + } + } + } catch {} + } + Write-LogHost " 10K server limit reached in this window" -ForegroundColor Yellow + break + } + } else { + Write-LogHost " Page $pageNumber empty – stopping" -ForegroundColor DarkCyan + break + } + $pageNumber++ + } + + if ($pageNumber -gt $maxPages) { + Write-LogHost " Reached max page limit ($maxPages)" -ForegroundColor Yellow + } + Write-LogHost " Pagination complete: $($allResults.Count) records" -ForegroundColor Green + return $allResults.ToArray() + } + catch { + Write-LogHost " Pagination failed: $($_.Exception.Message)" -ForegroundColor Red + throw + } +} + +# Wrapper for main processing (kept minimal for clarity) +function Invoke-PAXProcessingCore { + param() + try { + # Existing core logic already executed above in previous top-level scope. + # This wrapper intentionally left minimal to avoid structural parse issues. 
}   # closes the try block that Invoke-PAXProcessingCore opens on the preceding source line
    catch {
        Write-LogHost "Core processing error: $($_.Exception.Message)" -ForegroundColor Red
        throw
    }
}

# Script-scoped reliability state: adaptive throughput tracking and the circuit breaker
# (consecutive failure counter, open flag, and the time until which it stays open).
$script:adaptiveThroughputBaseline = $null
$script:adaptiveLowLatencyStreak = 0
$script:consecutiveBlockFailures = 0
$script:circuitBreakerOpen = $false
$script:circuitBreakerOpenUntil = $null

function Get-BackoffDelaySeconds {
    <#
    .SYNOPSIS
    Exponential backoff: BaseSeconds doubled for every attempt after the first, capped at MaxSeconds.
    .PARAMETER Attempt
    1-based attempt number; any value below 1 yields 0.
    .PARAMETER BaseSeconds
    Delay used for attempt 1.
    .PARAMETER MaxSeconds
    Upper bound applied to the computed delay.
    #>
    param(
        [Parameter(Mandatory)][int]$Attempt,
        [Parameter(Mandatory)][double]$BaseSeconds,
        [Parameter(Mandatory)][int]$MaxSeconds
    )
    if ($Attempt -ge 1) {
        $doublings = $Attempt - 1
        $uncapped = $BaseSeconds * [math]::Pow(2, $doublings)
        return [math]::Min($MaxSeconds, $uncapped)
    }
    return 0
}

function Test-CircuitBreakerTrip {
    <#
    .SYNOPSIS
    Returns $true once the consecutive-failure count has reached the threshold.
    #>
    param(
        [Parameter(Mandatory)][int]$ConsecutiveFailures,
        [Parameter(Mandatory)][int]$Threshold
    )
    $thresholdReached = $ConsecutiveFailures -ge $Threshold
    return $thresholdReached
}

# Serialization / flattening depths and the per-record explosion row cap.
$JsonDepth = 60
$FlatDepthStandard = 6
$FlatDepthDeep = 120
$ExplosionPerRecordRowCap = 1000
$script:TenantPrimaryDomain = $null
if (-not $script:TenantId) { $script:TenantId = $null }
$script:TenantIndicators = @()
$ForcedRawInputCsvExplosion = $false

# Auth config storage for token refresh (AppRegistration mode)
$script:AuthConfig = @{
    Method            = $null
    TenantId          = $null
    ClientId          = $null
    ClientSecret      = $null # SecureString
    CertThumbprint    = $null
    CertPath          = $null
    CertPassword      = $null # SecureString
    CertStoreLocation = 'CurrentUser'
    TokenIssueTime    = $null
    CanReauthenticate = $false
}

# Shared auth state for thread job token refresh (synchronized hashtable for cross-thread access)
# Thread jobs read Token from this hashtable; main thread updates it proactively before expiry
$script:SharedAuthState = [hashtable]::Synchronized(@{
    Token        = $null
    ExpiresOn    = $null
    LastRefresh  = $null
    RefreshCount = 0
    AuthMethod   = $null
})

# Checkpoint/Resume state for long-running operations
$script:CheckpointPath = $null # Path to checkpoint JSON file
$script:CheckpointData = $null # Loaded/active checkpoint object
$script:IsResumeMode = $false # Whether we're resuming from checkpoint
$script:PartialOutputPath = $null # Path to _PARTIAL.csv file during execution
$script:OriginallySkippedPartitionIndices = @() # Partition indices that were already completed before this run (for resume mode)
$script:StreamingMergeDuplicatesSkipped = 0 # Count of duplicate records removed during streaming merge
$script:StreamingMergeDataLoss = $false # Whether streaming merge detected missing partition data

# Token expiration detection (reactive - triggers on 401 Unauthorized)
$script:TokenAcquiredTime = $null # When current token was obtained
$script:AuthFailureDetected = $false # Set to $true when 401 error detected - triggers reauth prompt
$script:Auth401MessageShown = $false # Suppresses duplicate 401 error messages (reset after successful reauth)
$script:AuthPromptInProgress = $false # Debounce flag - prevents multiple auth prompts from triggering simultaneously

# PowerShell version detection for parallel processing features
$script:IsPS7 = ($PSVersionTable.PSVersion.Major -ge 7)

if ($RAWInputCSV) { $ForcedRawInputCsvExplosion = $true }

$script:RegexTrueFalse = [regex]::new('^(?i:true|false)$', [System.Text.RegularExpressions.RegexOptions]::Compiled)
$script:RegexYes1 = [regex]::new('^(?i:yes|1)$', [System.Text.RegularExpressions.RegexOptions]::Compiled)
$script:RegexNo0 = [regex]::new('^(?i:no|0)$', [System.Text.RegularExpressions.RegexOptions]::Compiled)
$script:LocaleDateParsingNotified = $false

function script:Parse-DateSafe {
    <#
    .SYNOPSIS
    Culture-invariant date parsing that handles Purview API date formats.
    .DESCRIPTION
    Purview API returns dates in US format (M/d/yyyy HH:mm:ss) regardless of client locale.
    This function safely parses such dates on systems with non-US regional settings (e.g., UK).

    Parsing order (first match wins):
      1. ISO 8601 layouts, parsed with DateTimeStyles.AdjustToUniversal.
      2. Explicit US layouts, parsed with DateTimeStyles.None.
      3. [datetime]::TryParse under InvariantCulture.
    The non-throwing TryParseExact/TryParse overloads are used so that a value which
    matches none of the layouts does not cost one thrown exception per layout.
    .PARAMETER DateValue
    A [datetime] (returned unchanged), a date string, or null/empty/whitespace.
    .OUTPUTS
    [datetime], or $null when the value is null, empty, whitespace, or unparseable.
    #>
    param([Parameter(Mandatory=$false)][AllowNull()][AllowEmptyString()]$DateValue)

    # Log once when running under non-US locale
    if (-not $script:LocaleDateParsingNotified) {
        $script:LocaleDateParsingNotified = $true
        $currentCulture = [System.Threading.Thread]::CurrentThread.CurrentCulture.Name
        if ($currentCulture -and $currentCulture -ne 'en-US') {
            Write-LogHost " [DATE] Locale-aware date parsing active (Culture: $currentCulture)" -ForegroundColor DarkCyan
        }
    }

    # Already a DateTime? Return as-is
    if ($DateValue -is [datetime]) { return $DateValue }

    # Null or empty? Return null
    if ([string]::IsNullOrWhiteSpace($DateValue)) { return $null }

    $dateStr = [string]$DateValue
    $invariant = [System.Globalization.CultureInfo]::InvariantCulture
    $parsed = [datetime]::MinValue

    # ISO 8601 formats first (most common from properly-formatted API responses).
    # Typed as string[] so PowerShell binds the multi-format TryParseExact overload.
    [string[]]$isoFormats = @(
        'yyyy-MM-ddTHH:mm:ss.fffffffK',
        'yyyy-MM-ddTHH:mm:ss.fffK',
        'yyyy-MM-ddTHH:mm:ssK',
        'yyyy-MM-ddTHH:mm:ss.fffffff',
        'yyyy-MM-ddTHH:mm:ss.fffZ',
        'yyyy-MM-ddTHH:mm:ssZ',
        'yyyy-MM-ddTHH:mm:ss.fff',
        'yyyy-MM-ddTHH:mm:ss',
        'yyyy-MM-dd HH:mm:ss.fff',
        'yyyy-MM-dd HH:mm:ss',
        'yyyy-MM-dd'
    )
    if ([datetime]::TryParseExact($dateStr, $isoFormats, $invariant, [System.Globalization.DateTimeStyles]::AdjustToUniversal, [ref]$parsed)) {
        return $parsed
    }

    # US formats explicitly (what Purview actually returns - causes UK locale issues)
    [string[]]$usFormats = @(
        'M/d/yyyy HH:mm:ss',
        'M/d/yyyy h:mm:ss tt',
        'M/d/yyyy H:mm:ss',
        'MM/dd/yyyy HH:mm:ss',
        'M/d/yyyy',
        'MM/dd/yyyy'
    )
    if ([datetime]::TryParseExact($dateStr, $usFormats, $invariant, [System.Globalization.DateTimeStyles]::None, [ref]$parsed)) {
        return $parsed
    }

    # Last resort: general-purpose parse under InvariantCulture
    if ([datetime]::TryParse($dateStr, $invariant, [System.Globalization.DateTimeStyles]::None, [ref]$parsed)) {
        return $parsed
    }

    return $null
}

function script:Format-DatePurviewFast($dt) {
    if (-not $dt) {
return '' }
    try {
        if ($dt -is [datetime]) {
            return $dt.ToUniversalTime().ToString('yyyy-MM-ddTHH:mm:ss.fffZ')
        }
        else {
            $p = script:Parse-DateSafe $dt
            if ($null -eq $p) { return '' }
            return $p.ToUniversalTime().ToString('yyyy-MM-ddTHH:mm:ss.fffZ')
        }
    }
    catch { return '' }
}

# Normalizes a boolean-like value to 'TRUE' / 'FALSE' text.
#   $null               -> ''
#   [bool]              -> upper-cased ToString()
#   'true' / 'false'    -> the same text upper-cased (case-insensitive match)
#   'yes' / '1'         -> 'TRUE'
#   'no'  / '0'         -> 'FALSE'
#   anything else       -> the value converted to string, unchanged
function script:BoolTFFast($v) {
    if ($null -eq $v) { return '' }
    if ($v -is [bool]) { return $v.ToString().ToUpper() }
    $vStr = [string]$v
    if ($script:RegexTrueFalse.IsMatch($vStr)) { return $vStr.ToUpper() }
    if ($script:RegexYes1.IsMatch($vStr)) { return 'TRUE' }
    if ($script:RegexNo0.IsMatch($vStr)) { return 'FALSE' }
    return $vStr
}

# Apply FlatDepth override (if provided)
try {
    if ($PSBoundParameters.ContainsKey('FlatDepth')) {
        $FlatDepthDeep = $FlatDepth
        $FlatDepthStandard = [int][Math]::Min($FlatDepth, $FlatDepthStandard)
    }
} catch {}

# Returns '' for $null, the value itself when Test-ScalarValue accepts it, otherwise its
# compressed JSON (depth $JsonDepth); falls back to a plain string cast if serialization throws.
function script:ToJsonIfObjectFast($v) {
    if ($null -eq $v) { return '' }
    if (Test-ScalarValue $v) { return $v }
    try { return ($v | ConvertTo-Json -Depth $JsonDepth -Compress) }
    catch { return [string]$v }
}

# Reads property $name from $parent via Get-SafeProperty and returns it wrapped as an array
# (empty array when the property is $null). The enumerable and scalar cases are both covered
# by the @() wrapper, so no separate branch is needed.
function script:GetArrayFast($parent, [string]$name) {
    $val = Get-SafeProperty $parent $name
    if ($null -eq $val) { return @() }
    return @($val)
}

$effectiveExplodeForProgress = ($ExplodeDeep -or $ExplodeArrays -or $ForcedRawInputCsvExplosion)

# MEMORY MANAGEMENT: Resolve MaxMemoryMB (-1 = auto 75% of system RAM, 0 = disabled, >0 = explicit limit)
$script:ResolvedMaxMemoryMB = $MaxMemoryMB
if ($MaxMemoryMB -eq -1) {
    # Auto-detect: use 75% of total physical memory
    try {
        $computerSystem = Get-CimInstance -ClassName Win32_ComputerSystem -ErrorAction SilentlyContinue
        $totalRAM = [math]::Round($computerSystem.TotalPhysicalMemory / 1MB, 0)
        # With errors silenced a failed CIM query yields no object, which would compute 0MB here
        # and silently switch memory management off; route that case to the fallback instead.
        if ($totalRAM -le 0) { throw "Total physical memory could not be determined" }
        $script:ResolvedMaxMemoryMB = [math]::Round($totalRAM * 0.75, 0)
        Write-LogHost "Memory management: Auto-detected ${totalRAM}MB total RAM -> limit set to $($script:ResolvedMaxMemoryMB)MB (75%)" -ForegroundColor Cyan
    } catch {
        # Fallback if CIM fails (e.g., Linux/macOS)
        $script:ResolvedMaxMemoryMB = 4096
        Write-LogHost "Memory management: Could not detect system RAM, defaulting to 4096MB limit" -ForegroundColor Yellow
    }
} elseif ($MaxMemoryMB -eq 0) {
    $script:ResolvedMaxMemoryMB = 0
    Write-LogHost "Memory management: DISABLED (-MaxMemoryMB 0)" -ForegroundColor DarkGray
}

# Memory flush mode: enabled when ResolvedMaxMemoryMB > 0 AND explosion is disabled (explosion needs full $allLogs in memory)
$script:memoryFlushEnabled = ($script:ResolvedMaxMemoryMB -gt 0) -and (-not $ExplodeDeep) -and (-not $ExplodeArrays) -and (-not $ForcedRawInputCsvExplosion)
$script:memoryFlushed = $false # Track if we've flushed $allLogs during this run (affects export path)
$enableParallelSwitchUsed = $EnableParallel.IsPresent
if ($enableParallelSwitchUsed) { $ParallelMode = 'On' }

function Get-ParallelActivationDecision {
    param(
        [array]$QueryPlan,
        [string]$ParallelMode,
        [int]$MaxParallelGroups,
        [int]$MaxConcurrency
    )
    $ps7 = ($PSVersionTable.PSVersion.Major -ge 7)
    $totalGroups = $QueryPlan.Count
    $totalActivities = ($QueryPlan | ForEach-Object { $_.Activities.Count } | Measure-Object -Sum).Sum
    # Auto parallel eligibility heuristic: previously required more than one group, causing single-activity
    # multi-partition scenarios (e.g., CopilotInteraction with 3 partitions) to run sequentially.
    # Adjust logic: allow auto parallel when there's at least one group AND either >1 group OR
    # a single group whose planned concurrency would yield >1 partition.
+ $singleGroupMultiPartition = ($totalGroups -eq 1) -and ($QueryPlan[0].Concurrency -gt 1) + $autoEligible = $ps7 -and ($MaxParallelGroups -gt 0) -and ($MaxConcurrency -gt 1) -and ($totalGroups -ge 1) -and (($totalGroups -gt 1) -or $singleGroupMultiPartition) + + switch ($ParallelMode) { + 'On' { + return @{ Enabled = ($ps7 -and $MaxParallelGroups -gt 0 -and $MaxConcurrency -gt 0); Reason = if ($ps7) { 'Forced On' } else { 'PS < 7 (cannot parallel)' }; AutoEligible = $autoEligible } + } + 'Auto' { + return @{ Enabled = $autoEligible; Reason = if ($autoEligible) { 'Auto criteria met' } else { 'Auto criteria not met' }; AutoEligible = $autoEligible } + } + default { + return @{ Enabled = $false; Reason = 'Mode Off'; AutoEligible = $autoEligible } + } + } +} + +$weights = if ($effectiveExplodeForProgress) { @{ Query = 0.30; Explosion = 0.60; Export = 0.10 } } else { @{ Query = 0.80; Explosion = 0.00; Export = 0.20 } } +if ($RAWInputCSV) { + try { + $weights = @{ Parsing = 0.10; Query = 0.0; Explosion = 0.80; Export = 0.10 } + } + catch {} +} +$script:originalWeights = $weights.Clone() +$script:progressState = @{ Weights = $weights; Phase = 'Query'; Parsing = @{Current = 0; Total = 0 }; Query = @{Current = 0; Total = 0 }; Explode = @{Current = 0; Total = 0 }; Export = @{Current = 0; Total = 1 } } +function Set-ProgressPhase { param([ValidateSet('Parsing', 'Query', 'Explosion', 'Export', 'Complete')] [string]$Phase, [string]$Status = ''); $script:progressState.Phase = $Phase; Update-Progress -Status $Status } +function Update-Progress { + param( + [string]$Status = '', + [int]$BatchCurrent = 0, + [int]$BatchTotal = 0, + [int]$BatchRangeStart = 0, + [int]$BatchRangeEnd = 0, + [int]$BatchStartPercent = 0, + [int]$BatchEndPercent = 0, + [bool]$BatchTotalIsEstimate = $false + ) + $w = $script:progressState.Weights; $ps = $script:progressState.Parsing; $qs = $script:progressState.Query; $es = $script:progressState.Explode; $xs = $script:progressState.Export + $pPct = if 
($ps.Total -gt 0 -and $w.ContainsKey('Parsing') -and $w.Parsing -gt 0) { [double]$ps.Current / [double]$ps.Total } else { 0.0 } + $qPct = if ($qs.Total -gt 0) { [double]$qs.Current / [double]$qs.Total } else { 0.0 } + $ePct = if ($es.Total -gt 0 -and $w.Explosion -gt 0) { [double]$es.Current / [double]$es.Total } else { 0.0 } + $xPct = if ($xs.Total -gt 0) { [double]$xs.Current / [double]$xs.Total } else { 0.0 } + # Zero-record weighting: emphasize Query progression when no records retrieved yet + if ($script:progressState.Phase -eq 'Query' -and ($script:metrics.TotalRecordsFetched -eq 0)) { + $w.Query = 1.0; $w.Explosion = 0.0; $w.Export = 0.0; if ($w.ContainsKey('Parsing')) { $w.Parsing = 0.0 } + } + # Restoration: Once at least one record has been fetched, revert weights if they were temporarily overridden. + elseif ($script:progressState.Phase -eq 'Query' -and ($script:metrics.TotalRecordsFetched -gt 0)) { + if ($script:originalWeights -and $w.Query -eq 1.0 -and $w.Explosion -eq 0.0 -and $w.Export -eq 0.0) { + foreach ($key in $script:originalWeights.Keys) { $w[$key] = $script:originalWeights[$key] } + } + } + # Calculate phase-specific progress details + $phase = $script:progressState.Phase + $pDetail = if ($w.ContainsKey('Parsing') -and $w.Parsing -gt 0 -and $ps.Total -gt 0) { "{0}/{1}({2}%)" -f $ps.Current, $ps.Total, ([int]([Math]::Round($pPct * 100))) } else { '' } + $qDetail = if ($w.Query -gt 0 -and $qs.Total -gt 0) { "{0}/{1}({2}%)" -f $qs.Current, $qs.Total, ([int]([Math]::Round($qPct * 100))) } else { '' } + if ($BatchRangeStart -ge 1 -and $BatchRangeEnd -ge 1 -and $es.Total -gt 0) { + if ($BatchStartPercent -ge 0 -and $BatchEndPercent -gt 0) { + $batchTotalDisplay = if ($BatchTotalIsEstimate) { "~$BatchTotal" } else { "$BatchTotal" } + $batchInfo = if ($BatchTotal -ge 1) { " Batch: {0}/{1}({2}%-{3}%)" -f $BatchCurrent, $batchTotalDisplay, $BatchStartPercent, $BatchEndPercent } else { '' } + } + else { + $batchPct = if ($BatchTotal -gt 0 -and 
$BatchCurrent -gt 0) { [int]([Math]::Round(([double]$BatchCurrent / [double]$BatchTotal) * 100)) } else { 0 } + $batchTotalDisplay = if ($BatchTotalIsEstimate) { "~$BatchTotal" } else { "$BatchTotal" } + $batchInfo = if ($BatchTotal -ge 1) { " Batch: {0}/{1}({2}%)" -f $BatchCurrent, $batchTotalDisplay, $batchPct } else { '' } + } + $explosionCounts = "Records {0}-{1}/{2}{3}" -f $BatchRangeStart, $BatchRangeEnd, $es.Total, $batchInfo + } + elseif ($BatchTotal -ge 1) { + $batchPct = if ($BatchTotal -gt 0 -and $BatchCurrent -gt 0) { [int]([Math]::Round(([double]$BatchCurrent / [double]$BatchTotal) * 100)) } else { 0 } + $batchTotalDisplay = if ($BatchTotalIsEstimate) { "~$BatchTotal" } else { "$BatchTotal" } + $batchInfo = " Batch: {0}/{1}({2}%)" -f $BatchCurrent, $batchTotalDisplay, $batchPct + $explosionCounts = if ($es.Total -gt 0) { "Records {0}/{1}{2}" -f $es.Current, $es.Total, $batchInfo } else { "0/0" } + } + else { + $explosionCounts = if ($es.Total -gt 0) { "{0}/{1}({2}%)" -f $es.Current, $es.Total, ([int]([Math]::Round($ePct * 100))) } else { '0/0' } + } + $eDetail = if ($w.Explosion -gt 0) { + if ($phase -eq 'Explosion') { + " | $explosionCounts" + } + else { + " | Explosion: $explosionCounts" + } + } + else { '' } + $batchDetail = '' + $xDetail = if ($xs.Total -gt 0) { " | Export: {0}/{1}({2}%)" -f $xs.Current, $xs.Total, ([int]([Math]::Round($xPct * 100))) } else { ' | Export: 0/0' } + $parsingLabel = 'Pre-parsing JSON' + if (($AgentId -or $AgentsOnly -or $ExcludeAgents -or $PromptFilter) -and $phase -eq 'Parsing') { + $parsingLabel = 'Pre-parsing + Filtering' + } + $phasePrefix = switch ($phase) { 'Parsing' { $parsingLabel } 'Query' { 'Query' } 'Explosion' { 'Explosion' } 'Export' { 'Export' } 'Complete' { 'Complete' } default { $phase } } + if ($phase -eq 'Parsing' -and $pDetail) { + $composite = "${phasePrefix}: $pDetail$eDetail$batchDetail$xDetail" + } + elseif ($phase -eq 'Explosion' -and -not $qDetail) { + $composite = "Explosion: 
$explosionCounts$batchDetail$xDetail" + } + else { + $composite = if ($qDetail) { "${phasePrefix}: $qDetail$eDetail$batchDetail$xDetail" } else { "${phasePrefix}:$eDetail$batchDetail$xDetail" } + } + $statusText = if ($Status) { "$Status :: $composite" } else { $composite } + if ($statusText.Length -gt 180) { $statusText = $statusText.Substring(0, 177) + '...' } + # Placeholder for progress display compatibility +} +function Complete-Progress { + # Placeholder for progress display compatibility +} + +# Lightweight explicit progress tick to ensure visual movement in long zero-record scenarios. +function Write-ProgressTick { + # Placeholder for progress display compatibility +} + +$script:learnedActivityBlockSize = @{} +$script:globalLearnedBlockSize = $BlockHours + +function Get-QueryPlan { + param([string[]]$RequestedActivities) + # Normalize and deduplicate (DSPM logic already handled fallback, so no default here) + $normalized = @(); foreach ($a in $RequestedActivities) { if ($a -and -not ($normalized -contains $a)) { $normalized += $a } } + # If still empty after DSPM logic, something went wrong - but DSPM validation should prevent this + if ($normalized.Count -eq 0) { + Write-Host "ERROR: No activity types provided to Get-QueryPlan. This should not happen after DSPM validation." 
-ForegroundColor Red
        exit 1
    }
    $plan = @(); $i = 0

    # DUAL-MODE QUERY PLANNING:
    # Graph API mode: Combine all activity types into single group (Graph API accepts multiple operationFilters)
    # EOM mode: Separate groups per activity type (Search-UnifiedAuditLog performs better with single activity)
    if (-not $UseEOM) {
        # Graph API mode: Single group with all activities combined
        $plan += @{
            Name = "Combined: $($normalized -join ', ')";
            Group = 'GraphCombined';
            Activities = $normalized;
            Concurrency = $MaxConcurrency
        }
    }
    else {
        # EOM mode: One group per activity type (sequential processing)
        foreach ($a in $normalized) {
            $i++
            $plan += @{
                Name = "Activity: $a";
                Group = 'EOM_Sequential';
                Activities = @($a);
                Concurrency = $MaxConcurrency
            }
        }
    }
    return $plan
}

function Update-LearnedBlockSize {
    <#
    .SYNOPSIS
    Adapts the per-activity query block size (in hours) from the outcome of the last block.
    .DESCRIPTION
    Writes the new size to $script:learnedActivityBlockSize[$ActivityType] and logs it.
      Failure                       -> halve; also lowers $script:globalLearnedBlockSize
      RecordCount = $ResultSize     -> halve; also lowers $script:globalLearnedBlockSize
      RecordCount > 80% of limit    -> shrink to 70%
      RecordCount <  5% of limit    -> double        (very low volume)
      RecordCount < 10% of limit    -> grow by 50%   (low volume)
      otherwise                     -> no change
    Shrinking never goes below 0.083333 h (five minutes); growing never exceeds 24 h.
    The 5% test is evaluated before the 10% test: in the opposite order every count under
    5% is already captured by the 10% branch and the "very low volume" rule can never fire.
    .PARAMETER ActivityType
    Key under which the learned size is stored.
    .PARAMETER BlockHours
    Size, in hours, of the block that was just processed.
    .PARAMETER RecordCount
    Number of records that block returned.
    .PARAMETER Success
    $false when the block query failed.
    #>
    param([string]$ActivityType, [double]$BlockHours, [int]$RecordCount, [bool]$Success)

    $minBlockHours = 0.083333
    $maxBlockHours = 24.0

    if (-not $Success) {
        $newSize = [Math]::Max($minBlockHours, $BlockHours * 0.5)
        $script:learnedActivityBlockSize[$ActivityType] = $newSize
        $script:globalLearnedBlockSize = [Math]::Min($script:globalLearnedBlockSize, $newSize)
        Write-LogHost " → Learned: Reducing block size to $([math]::Round($newSize,2))h due to failure" -ForegroundColor Magenta
        return
    }

    if ($RecordCount -eq $ResultSize) {
        $newSize = [Math]::Max($minBlockHours, $BlockHours * 0.5)
        $script:learnedActivityBlockSize[$ActivityType] = $newSize
        $script:globalLearnedBlockSize = [Math]::Min($script:globalLearnedBlockSize, $newSize)
        Write-LogHost " → Learned: Reducing block size to $([math]::Round($newSize,2))h due to limit hit" -ForegroundColor Magenta
    }
    elseif ($RecordCount -gt ($ResultSize * 0.8)) {
        $newSize = [Math]::Max($minBlockHours, $BlockHours * 0.7)
        $script:learnedActivityBlockSize[$ActivityType] = $newSize
        Write-LogHost " → Learned: Reducing block size to $([math]::Round($newSize,2))h (high volume: $RecordCount records)" -ForegroundColor Magenta
    }
    elseif ($RecordCount -lt ($ResultSize * 0.05)) {
        $newSize = [Math]::Min($maxBlockHours, $BlockHours * 2.0)
        $script:learnedActivityBlockSize[$ActivityType] = $newSize
        Write-LogHost " → Learned: Increasing block size to $([math]::Round($newSize,2))h (very low volume: $RecordCount records)" -ForegroundColor Magenta
    }
    elseif ($RecordCount -lt ($ResultSize * 0.1)) {
        $newSize = [Math]::Min($maxBlockHours, $BlockHours * 1.5)
        $script:learnedActivityBlockSize[$ActivityType] = $newSize
        Write-LogHost " → Learned: Increasing block size to $([math]::Round($newSize,2))h (low volume: $RecordCount records)" -ForegroundColor Magenta
    }
}

# Halves a block size given in hours, never going below two minutes.
# The floor is written as 2/60 h: the literal 0.001389 used previously is two minutes expressed
# as a fraction of a DAY, which in hours is only about five seconds.
function Get-NextSmallerBlockSize {
    param([double]$CurrentSize)
    return [Math]::Max((2.0 / 60.0), $CurrentSize / 2)
}

# Picks the starting block size (hours) for an activity: the size learned for that activity if
# one exists, else the global learned size when it differs from $BlockHours, else $BlockHours.
function Get-OptimalBlockSize {
    param([string]$ActivityType)
    if ($script:learnedActivityBlockSize.ContainsKey($ActivityType)) {
        return $script:learnedActivityBlockSize[$ActivityType]
    }
    elseif ($script:globalLearnedBlockSize -ne $BlockHours) {
        return $script:globalLearnedBlockSize
    }
    else {
        return $BlockHours
    }
}

function Invoke-ActivityTimeWindowProcessing {
    param(
        [Parameter(Mandatory = $true)][string]$ActivityType,
        [Parameter(Mandatory = $true)][datetime]$StartDate,
        [Parameter(Mandatory = $true)][datetime]$EndDate,
        [int]$PartitionIndex = 1,
        [int]$TotalPartitions = 1,
        [bool]$UseEOMMode = $false
    )

    Write-Host "Processing $ActivityType (partition $PartitionIndex/$TotalPartitions) from $($StartDate.ToString('yyyy-MM-dd HH:mm')) to $($EndDate.ToString('yyyy-MM-dd HH:mm'))..."
-ForegroundColor White + $blockHours = Get-OptimalBlockSize -ActivityType $ActivityType + Write-Host " Using initial block size: $blockHours hours" -ForegroundColor DarkCyan + + $allResults = New-Object System.Collections.ArrayList + $current = $StartDate + $blockNumber = 1 + + while ($current -lt $EndDate) { + # Show progress BEFORE block processing to ensure visibility (before log output clears it) + Write-ProgressTick + + if ($script:circuitBreakerOpen) { + if ($script:circuitBreakerOpenUntil -and (Get-Date) -lt $script:circuitBreakerOpenUntil) { + Write-LogHost " Circuit breaker OPEN until $($script:circuitBreakerOpenUntil.ToString('HH:mm:ss')) – skipping remaining blocks for $ActivityType" -ForegroundColor Red + break + } else { + $script:circuitBreakerOpen = $false + $script:consecutiveBlockFailures = 0 + Write-LogHost " Circuit breaker cooldown elapsed – resuming block processing" -ForegroundColor DarkGreen + } + } + if ($script:learnedActivityBlockSize.ContainsKey($ActivityType)) { + $blockHours = $script:learnedActivityBlockSize[$ActivityType] + } + + $blockEnd = $current.AddHours($blockHours) + if ($blockEnd -gt $EndDate) { $blockEnd = $EndDate } + + $actualBlockHours = [math]::Round(($blockEnd - $current).TotalHours, 2) + Write-Host " Block $blockNumber`: $($current.ToString('yyyy-MM-dd HH:mm')) to $($blockEnd.ToString('yyyy-MM-dd HH:mm')) ($($actualBlockHours)h)" -ForegroundColor Yellow + + try { + $results = Invoke-PurviewAuditQuery -StartDate $current -EndDate $blockEnd -Operations $ActivityType -ResultSize $ResultSize -UserIds $script:targetUsers -UseEOMMode $UseEOMMode + + if ($results -and $results.Count -gt 0) { + # Safe add - handle both array and single object + if ($results -is [Array]) { + foreach ($item in $results) { [void]$allResults.Add($item) } + } else { + [void]$allResults.Add($results) + } + Write-Host " Added $($results.Count) records (total: $($allResults.Count))" -ForegroundColor Green + Update-LearnedBlockSize -ActivityType 
$ActivityType -BlockHours $actualBlockHours -RecordCount $results.Count -Success $true + $script:consecutiveBlockFailures = 0 + } + else { + Write-Host " No records found in this block" -ForegroundColor Gray + $script:consecutiveBlockFailures = 0 + } + } + catch { + Write-Host " Block failed: $($_.Exception.Message)" -ForegroundColor Red + Update-LearnedBlockSize -ActivityType $ActivityType -BlockHours $actualBlockHours -RecordCount 0 -Success $false + $script:consecutiveBlockFailures++ + $attemptNum = $script:consecutiveBlockFailures + $expDelay = [math]::Min($BackoffMaxSeconds, $BackoffBaseSeconds * [math]::Pow(2, ($attemptNum - 1))) + $jitterMs = Get-Random -Minimum 150 -Maximum 750 + $totalDelaySec = [math]::Round($expDelay,2) + [math]::Round($jitterMs/1000,2) + try { $script:metrics.BackoffTotalDelaySeconds += $totalDelaySec } catch {} + Write-LogHost " Reliability: Backoff delay $([math]::Round($expDelay,2))s + jitter $([math]::Round($jitterMs/1000,2))s (attempt $attemptNum)" -ForegroundColor DarkYellow + Start-Sleep -Seconds ([int][math]::Ceiling($expDelay)) + Start-Sleep -Milliseconds $jitterMs + if ($script:consecutiveBlockFailures -ge $CircuitBreakerThreshold) { + $script:circuitBreakerOpen = $true + $script:circuitBreakerOpenUntil = (Get-Date).AddSeconds($CircuitBreakerCooldownSeconds) + try { $script:metrics.CircuitBreakerTrips++ } catch {} + Write-LogHost " CIRCUIT BREAKER TRIPPED after $script:consecutiveBlockFailures consecutive block failures – cooling down for $CircuitBreakerCooldownSeconds seconds (until $($script:circuitBreakerOpenUntil.ToString('HH:mm:ss')))" -ForegroundColor Magenta + break + } + if ($blockHours -gt 0.5) { + $smallerBlockHours = Get-NextSmallerBlockSize -CurrentSize $blockHours + Write-Host " Retrying with smaller $smallerBlockHours hour block..." 
-ForegroundColor Yellow + + try { + $blockEnd = $current.AddHours($smallerBlockHours) + if ($blockEnd -gt $EndDate) { $blockEnd = $EndDate } + + $results = Invoke-PurviewAuditQuery -StartDate $current -EndDate $blockEnd -Operations $ActivityType -ResultSize $ResultSize -UserIds $script:targetUsers -UseEOMMode $UseEOMMode + + if ($results -and $results.Count -gt 0) { + # Safe add - handle both array and single object + if ($results -is [Array]) { + foreach ($item in $results) { [void]$allResults.Add($item) } + } else { + [void]$allResults.Add($results) + } + Write-Host " Smaller block succeeded: $($results.Count) records" -ForegroundColor Green + Update-LearnedBlockSize -ActivityType $ActivityType -BlockHours $smallerBlockHours -RecordCount $results.Count -Success $true + $blockHours = $smallerBlockHours + $script:consecutiveBlockFailures = 0 + } + } + catch { + Write-Host " Smaller block also failed: $($_.Exception.Message)" -ForegroundColor Red + $script:consecutiveBlockFailures++ + $attemptNum = $script:consecutiveBlockFailures + $expDelay = [math]::Min($BackoffMaxSeconds, $BackoffBaseSeconds * [math]::Pow(2, ($attemptNum - 1))) + $jitterMs = Get-Random -Minimum 150 -Maximum 750 + $totalDelaySec = [math]::Round($expDelay,2) + [math]::Round($jitterMs/1000,2) + try { $script:metrics.BackoffTotalDelaySeconds += $totalDelaySec } catch {} + Write-LogHost " Reliability: Backoff delay $([math]::Round($expDelay,2))s + jitter $([math]::Round($jitterMs/1000,2))s (attempt $attemptNum)" -ForegroundColor DarkYellow + Start-Sleep -Seconds ([int][math]::Ceiling($expDelay)) + Start-Sleep -Milliseconds $jitterMs + if ($script:consecutiveBlockFailures -ge $CircuitBreakerThreshold) { + $script:circuitBreakerOpen = $true + $script:circuitBreakerOpenUntil = (Get-Date).AddSeconds($CircuitBreakerCooldownSeconds) + try { $script:metrics.CircuitBreakerTrips++ } catch {} + Write-LogHost " CIRCUIT BREAKER TRIPPED after $script:consecutiveBlockFailures consecutive block failures – cooling 
down for $CircuitBreakerCooldownSeconds seconds (until $($script:circuitBreakerOpenUntil.ToString('HH:mm:ss')))" -ForegroundColor Magenta + break + } + } + } + } + + try { + if ($script:progressState.Query.Current -ge $script:progressState.Query.Total) { + $script:progressState.Query.Total += 1 + } + $script:progressState.Query.Current += 1 + $script:progressBlocksCompleted = ($script:progressBlocksCompleted + 1) + $script:progressBlockHoursSum = ($script:progressBlockHoursSum + $actualBlockHours) + if ($script:progressBlocksCompleted -gt 0) { + # --- Progress Estimation Logic (Improved for multi-partition accuracy) --- + # Previously, the dynamic recalculation only considered the current partition's remaining hours. + # In multi-partition scenarios this allowed Query.Total to shrink between partitions, causing + # premature 100% completion when later partitions had not yet started. + # New approach: + # 1. Estimate remaining blocks in the CURRENT partition (as before). + # 2. Add an estimate for yet-to-start partitions based on the average blocks/partition so far. + # 3. Enforce a monotonic (non-decreasing) Query.Total so percent cannot jump to 100% early. 
+ $avgBlock = $script:progressBlockHoursSum / $script:progressBlocksCompleted + $elapsedHours = $script:progressBlockHoursSum + $currentPartitionRangeHours = ($EndDate - $StartDate).TotalHours + $remainingHoursCurrentPartition = [Math]::Max(0.0, $currentPartitionRangeHours - $elapsedHours) + $remainingBlocksEstCurrent = if ($avgBlock -gt 0) { [Math]::Ceiling($remainingHoursCurrentPartition / $avgBlock) } else { 0 } + $remainingPartitions = if ($TotalPartitions -gt $PartitionIndex) { $TotalPartitions - $PartitionIndex } else { 0 } + $avgBlocksPerCompletedPartition = if ($PartitionIndex -gt 0) { [double]$script:progressBlocksCompleted / [double]$PartitionIndex } else { [double]$script:progressBlocksCompleted } + $futurePartitionBlocksEst = if ($remainingPartitions -gt 0 -and $avgBlocksPerCompletedPartition -gt 0) { [int][Math]::Ceiling($avgBlocksPerCompletedPartition * $remainingPartitions) } else { 0 } + $newCalcGlobal = $script:progressBlocksCompleted + $remainingBlocksEstCurrent + $futurePartitionBlocksEst + # Apply optional smoothing but NEVER allow total to decrease (monotonic total). + if ($ProgressSmoothingAlpha -gt 0 -and $script:progressState.Query.Total -gt 0) { + $smoothed = [int]([Math]::Round(($ProgressSmoothingAlpha * $newCalcGlobal) + ((1 - $ProgressSmoothingAlpha) * $script:progressState.Query.Total))) + $newTotalCandidate = [Math]::Max($script:progressState.Query.Total, $smoothed, $newCalcGlobal) + } else { + $newTotalCandidate = [Math]::Max($script:progressState.Query.Total, $newCalcGlobal) + } + $script:progressState.Query.Total = [Math]::Max($script:progressState.Query.Total, $newTotalCandidate, $script:progressBlocksCompleted) + } + Update-Progress + # Explicit tick for visibility even if Update-Progress weighting collapses. 
Write-ProgressTick
        }
        catch {}

        $current = $blockEnd
        $blockNumber++
    }

    Write-Host " Completed $ActivityType (partition $PartitionIndex/$TotalPartitions)`: $($allResults.Count) total records" -ForegroundColor Green
    return $allResults.ToArray()
}

# Note: Write-Log and Write-LogHost are defined earlier in the script (near line 670)

function Open-CsvWriter {
    <#
    .SYNOPSIS
        Opens the script-wide CSV writer and writes the header row.
    .DESCRIPTION
        Creates a UTF-8 (no BOM) StreamWriter over $Path, overwriting any existing file,
        stores it in $script:PAX_CsvWriter and writes $Columns as one comma-separated
        header line. A column name is wrapped in double quotes (embedded quotes doubled)
        when it contains a quote, comma, CR or LF, or starts/ends with a space.
        Any writer still open from an earlier call is flushed and disposed first.
    .PARAMETER Path
        File to create or overwrite.
    .PARAMETER Columns
        Column names, in output order.
    #>
    param([string]$Path, [string[]]$Columns)

    # Only one writer can live in $script:PAX_CsvWriter; release the previous one
    # instead of orphaning its file handle and unflushed buffer.
    Close-CsvWriter

    $enc = New-Object System.Text.UTF8Encoding($false)
    # OPTIMIZATION: Use 1MB StreamWriter buffer (default is 1KB) to reduce write syscalls
    $script:PAX_CsvWriter = [System.IO.StreamWriter]::new($Path, $false, $enc, 1048576)
    try {
        $escapedCols = New-Object System.Collections.Generic.List[string]
        foreach ($col in $Columns) {
            $c = [string]$col
            $needsQuote = ($c -match '[",\r\n]') -or $c.StartsWith(' ') -or $c.EndsWith(' ')
            $escaped = $c -replace '"', '""'
            if ($needsQuote) { $escaped = '"' + $escaped + '"' }
            $escapedCols.Add($escaped) | Out-Null
        }
        $script:PAX_CsvWriter.WriteLine(($escapedCols -join ','))
    }
    catch {
        # Do not leave a half-initialised writer (and a locked file) behind.
        Close-CsvWriter
        throw
    }
}

function Close-CsvWriter {
    <#
    .SYNOPSIS
        Flushes and disposes the script-wide CSV writer, if one is open.
    .DESCRIPTION
        Best-effort: errors from Flush/Dispose are swallowed. Flush and Dispose are
        attempted independently so a failed flush cannot skip the dispose.
        The $script:PAX_CsvWriter variable is removed afterwards.
    #>
    if ($script:PAX_CsvWriter) {
        try { $script:PAX_CsvWriter.Flush() } catch {}
        try { $script:PAX_CsvWriter.Dispose() } catch {}
        Remove-Variable PAX_CsvWriter -Scope Script -ErrorAction SilentlyContinue
    }
}

function Write-CsvRows {
    <#
    .SYNOPSIS
        Appends data rows to the CSV opened by Open-CsvWriter.
    .DESCRIPTION
        Each non-null row becomes one line with exactly $Columns.Count fields, in
        $Columns order. A row may be a dictionary (hashtable, [ordered] dictionary or
        any other IDictionary), whose keys are matched against column names, or any
        other object, whose properties are matched by name. Keys/properties that are
        not in $Columns are ignored; columns the row does not supply are left empty.
        Collection values (other than strings) are flattened to a ';'-joined string.
        A field is quoted when it contains a quote, comma, CR or LF, or starts/ends
        with a space; embedded quotes are doubled.
    .PARAMETER Rows
        Rows to write. Null or empty input is a no-op.
    .PARAMETER Columns
        Column names, in the same order that was passed to Open-CsvWriter.
    .NOTES
        Throws "CSV writer not initialized" when no writer is open.
        The per-cell formatting is intentionally duplicated inline in both branches:
        this is the hot path and the existing code avoids per-cell call overhead.
    #>
    param([System.Collections.IEnumerable]$Rows, [string[]]$Columns)
    if (-not $Rows) { return }
    if (-not $script:PAX_CsvWriter) { throw "CSV writer not initialized" }

    # Pre-compile regex once (not per-cell) for significant performance gain
    $needsQuotePattern = [regex]::new('[",\r\n]', [System.Text.RegularExpressions.RegexOptions]::Compiled)

    # OPTIMIZATION: Pre-allocate larger buffer (4MB) to reduce write syscalls
    $sb = New-Object System.Text.StringBuilder(4194304)
    $colCount = $Columns.Count
    $fieldValues = New-Object string[] $colCount  # Reuse array instead of creating List per row

    # OPTIMIZATION: Build column index lookup table for O(1) access by name
    # This eliminates per-cell string lookups which were the main bottleneck
    $columnIndex = @{}
    for ($i = 0; $i -lt $colCount; $i++) {
        $columnIndex[$Columns[$i]] = $i
    }

    foreach ($row in $Rows) {
        if ($null -eq $row) { continue }

        # Reset field values (faster than creating new array)
        for ($i = 0; $i -lt $colCount; $i++) { $fieldValues[$i] = "" }

        # For dictionaries, iterate keys and map to column index (much faster than iterating columns).
        # Testing for IDictionary rather than [hashtable] also covers [ordered]@{} and generic
        # dictionaries, whose PSObject.Properties expose Count/Keys/Values instead of the entries.
        if ($row -is [System.Collections.IDictionary]) {
            foreach ($key in $row.Keys) {
                if ($columnIndex.ContainsKey($key)) {
                    $idx = $columnIndex[$key]
                    $val = $row[$key]

                    if ($null -eq $val) { continue }

                    # Handle arrays/collections ($null on the left: a nested array on the left
                    # would turn the comparison into a filter)
                    if ($val -is [System.Collections.IEnumerable] -and -not ($val -is [string])) {
                        try { $val = ($val | ForEach-Object { if ($null -ne $_) { [string]$_ } else { '' } }) -join ';' } catch { $val = [string]$val }
                    }

                    $s = [string]$val
                    # Use pre-compiled regex and avoid method calls where possible
                    if ($s.Length -gt 0 -and ($needsQuotePattern.IsMatch($s) -or $s[0] -eq ' ' -or $s[$s.Length - 1] -eq ' ')) {
                        $s = '"' + ($s -replace '"', '""') + '"'
                    }
                    $fieldValues[$idx] = $s
                }
            }
        } else {
            # OPTIMIZED: For PSObjects, iterate only populated properties (not all columns)
            # This reduces iterations from ~163 columns to ~50 actual properties per row
            foreach ($prop in $row.PSObject.Properties) {
                $key = $prop.Name
                if (-not $columnIndex.ContainsKey($key)) { continue }

                $idx = $columnIndex[$key]
                $val = $prop.Value

                if ($null -eq $val) { continue }

                # Handle arrays/collections (same rules as the dictionary branch)
                if ($val -is [System.Collections.IEnumerable] -and -not ($val -is [string])) {
                    try { $val = ($val | ForEach-Object { if ($null -ne $_) { [string]$_ } else { '' } }) -join ';' } catch { $val = [string]$val }
                }

                $s = [string]$val
                if ($s.Length -gt 0 -and ($needsQuotePattern.IsMatch($s) -or $s[0] -eq ' ' -or $s[$s.Length - 1] -eq ' ')) {
                    $s = '"' + ($s -replace '"', '""') + '"'
                }
                $fieldValues[$idx] = $s
            }
        }

        [void]$sb.AppendLine(($fieldValues -join ','))
        # Flush at 4MB (increased from 1MB to reduce write syscalls)
        if ($sb.Length -gt 4194304) {
            $script:PAX_CsvWriter.Write($sb.ToString())
            [void]$sb.Clear()
        }
    }
    if ($sb.Length -gt 0) { $script:PAX_CsvWriter.Write($sb.ToString()) }
}

function Test-AgentFilter {
    <#
    .SYNOPSIS
        Decides whether a parsed audit record passes the agent filters.
    .DESCRIPTION
        With no filter active every record passes. With either filter active the
        record must carry a non-blank AgentId; when $AgentIdFilter is supplied that
        AgentId must additionally equal one of the listed ids (PowerShell's default,
        case-insensitive comparison). A failure while reading AgentId rejects the record.
    .PARAMETER ParsedAuditData
        Parsed audit record; only its AgentId member is read.
    .PARAMETER AgentIdFilter
        Optional list of AgentIds to keep.
    .PARAMETER AgentsOnlyFilter
        When true, keep only records that have an AgentId.
    .OUTPUTS
        [bool] $true when the record should be kept.
    #>
    param(
        [Parameter(Mandatory = $true)]
        $ParsedAuditData,
        [string[]]$AgentIdFilter,
        [bool]$AgentsOnlyFilter
    )
    if (-not $AgentIdFilter -and -not $AgentsOnlyFilter) {
        return $true
    }

    $recordAgentId = $null
    try {
        if ($ParsedAuditData.AgentId) {
            $recordAgentId = [string]$ParsedAuditData.AgentId
        }
    }
    catch {
        return $false
    }

    # From here on at least one filter is active, and both require an AgentId.
    if ([string]::IsNullOrWhiteSpace($recordAgentId)) {
        return $false
    }
    if (-not $AgentIdFilter) {
        return $true
    }
    return ($AgentIdFilter -contains $recordAgentId)
}

# Ensure output directory exists
if (-not (Test-Path $OutputPath)) { New-Item -Path $OutputPath -ItemType Directory -Force | Out-Null }

# Generate output filename with proper extension
$isDSPMEnabled = Test-DSPMFeaturesEnabled
$fileExtension = if ($ExportWorkbook) { "xlsx" } else { "csv" }
$filePrefix = "Purview_Audit"

# Determine initial combine mode (needed early for filename decision)
$isCsv = (-not $ExportWorkbook)
$initialCsvCombine = if ($RAWInputCSV -and $isCsv) { $true } elseif ($isCsv) { $CombineOutput.IsPresent } else { $false }

# Determine script mode for logging and validation
$scriptMode = if ($RAWInputCSV) {
    "Replay (RAWInputCSV)"
} elseif ($ExplodeDeep) {
    "Deep Column Explosion (-ExplodeDeep)"
} elseif ($ExplodeArrays -or $ForcedRawInputCsvExplosion) {
    if ($ForcedRawInputCsvExplosion -and -not $ExplodeArrays.IsPresent -and -not $ExplodeDeep.IsPresent) { "Array Explosion 
(-ExplodeArrays, RAWInput implied)" } else { "Array Explosion (-ExplodeArrays)" } +} else { + "Standard (1:1, no explosion)" +} + +# Resolve OutputFile path +# OnlyUserInfo mode: Output file is EntraUsers_MAClicensing (log file will match) +if ($OnlyUserInfo) { + $fileExtension = if ($ExportWorkbook) { "xlsx" } else { "csv" } + $OutputFile = Join-Path $OutputPath "EntraUsers_MAClicensing_$global:ScriptRunTimestamp.$fileExtension" +} +elseif ($AppendFile) { + # User provided filename or full path for appending + if ([System.IO.Path]::IsPathRooted($AppendFile)) { + # Full path provided + $OutputFile = $AppendFile + } else { + # Relative filename - combine with OutputPath + $OutputFile = Join-Path $OutputPath $AppendFile + } + + # Validate file exists + if (-not (Test-Path $OutputFile)) { + Write-Host "ERROR: Cannot append to file - file does not exist: $OutputFile" -ForegroundColor Red + Write-Host "" -ForegroundColor Yellow + Write-Host "The file must exist before you can append to it." -ForegroundColor Yellow + Write-Host "Either:" -ForegroundColor Green + Write-Host " 1. Create the file first by running without -AppendFile" -ForegroundColor Green + Write-Host " 2. 
Verify the path and filename are correct" -ForegroundColor Green + exit 1 + } + + # Note: Column validation happens after banner display +} +elseif ($ExportWorkbook) { + # Excel workbook mode - determine final filename upfront so log file matches + if ($CombineOutput) { + # Single-tab workbook + if ($IncludeUserInfo -and -not $UseEOM) { + $OutputFile = Join-Path $OutputPath "Purview_Audit_CombinedUsageActivity_EntraUsers_MAClicensing_$global:ScriptRunTimestamp.xlsx" + } else { + $OutputFile = Join-Path $OutputPath "Purview_Audit_CombinedUsageActivity_$global:ScriptRunTimestamp.xlsx" + } + } else { + # Multi-tab workbook + if ($isDSPMEnabled) { + $OutputFile = Join-Path $OutputPath "Purview_Audit_MultiTab_$global:ScriptRunTimestamp.xlsx" + } else { + $OutputFile = Join-Path $OutputPath "Purview_Audit_MultiTab_$global:ScriptRunTimestamp.xlsx" + } + } +} elseif ($isCsv -and $initialCsvCombine) { + # CSV combined activity naming; EntraUsers exported separately + # Initial default combined output filename (may be dynamically downgraded to single-activity later) + $OutputFile = Join-Path $OutputPath "Purview_Audit_CombinedUsageActivity_$global:ScriptRunTimestamp.csv" +} else { + $OutputFile = Join-Path $OutputPath "${filePrefix}_$global:ScriptRunTimestamp.$fileExtension" +} + +# Ensure output directory exists after possible OutputPath override +if (-not (Test-Path $OutputPath)) { New-Item -Path $OutputPath -ItemType Directory -Force | Out-Null } + +# When ExportWorkbook mode, set up intermediate CSV path (OutputFile stays as final .xlsx for log naming) +if ($ExportWorkbook -and -not $AppendFile) { + # CSV intermediate file uses same base name but .csv extension + $script:CsvOutputFile = Join-Path $OutputPath ([System.IO.Path]::GetFileNameWithoutExtension($OutputFile) + ".csv") +} else { + $script:CsvOutputFile = $OutputFile +} + +# ============================================================ +# CHECKPOINT SYSTEM: Initialize for new runs or set paths for resume +# 
============================================================ +# Checkpoint is enabled for ALL auth modes (AppRegistration, WebLogin, DeviceCode) +# Enables resume after Ctrl+C, network interruptions, system restarts, or any failure +$script:CheckpointEnabled = (-not $RAWInputCSV) -and (-not $OnlyUserInfo) + +# NOTE: Use $ResumeSpecified here (set early via RemainingArgs parsing) rather than $script:IsResumeMode +# which is only set later during resume detection. This skips new checkpoint creation when -Resume is specified. +if ($ResumeSpecified) { + # Resume mode: checkpoint paths will be set later during resume detection + # Skip all checkpoint initialization here - it will be handled in the resume detection block + $script:FinalOutputPath = $null # Will be set during resume + $script:PartialOutputPath = $null # Will be set during resume + # Also skip log file setup - will be set after checkpoint is loaded + $script:DeferLogFileSetup = $true +} +elseif ($script:CheckpointEnabled) { + # New run with checkpoint enabled: Add _PARTIAL suffix + $script:FinalOutputPath = $OutputFile + $dir = Split-Path $OutputFile -Parent + $baseName = [System.IO.Path]::GetFileNameWithoutExtension($OutputFile) + $ext = [System.IO.Path]::GetExtension($OutputFile) + $script:PartialOutputPath = Join-Path $dir "${baseName}_PARTIAL${ext}" + $OutputFile = $script:PartialOutputPath + # For ExportWorkbook mode, CsvOutputFile must use .csv extension (not .xlsx from PartialOutputPath) + if ($ExportWorkbook) { + $script:CsvOutputFile = Join-Path $dir "${baseName}_PARTIAL.csv" + } else { + $script:CsvOutputFile = $script:PartialOutputPath + } + + # Initialize checkpoint file for this run with ALL parameters for complete state restoration + $baseFileName = Split-Path $script:FinalOutputPath -Leaf + $allParams = @{ + # Date range + StartDate = $StartDate + EndDate = $EndDate + # Activity/Record filtering + ActivityTypes = $ActivityTypes + RecordTypes = $RecordTypes + ServiceTypes = $ServiceTypes + 
UserIds = $UserIds + GroupNames = $GroupNames + # Agent filtering + AgentId = $AgentId + AgentsOnly = $AgentsOnly.IsPresent + ExcludeAgents = $ExcludeAgents.IsPresent + # Prompt filtering + PromptFilter = $PromptFilter + # Schema/Explosion + ExplodeArrays = $ExplodeArrays.IsPresent + ExplodeDeep = $ExplodeDeep.IsPresent + FlatDepth = $FlatDepth + StreamingSchemaSample = $StreamingSchemaSample + StreamingChunkSize = $StreamingChunkSize + # M365/User info + IncludeM365Usage = $IncludeM365Usage.IsPresent + IncludeUserInfo = $IncludeUserInfo.IsPresent + IncludeDSPMForAI = $IncludeDSPMForAI.IsPresent + IncludeCopilotInteraction = $IncludeCopilotInteraction.IsPresent + ExcludeCopilotInteraction = $ExcludeCopilotInteraction.IsPresent + # Partitioning + BlockHours = $BlockHours + PartitionHours = $PartitionHours + MaxPartitions = $MaxPartitions + # Output + ExportWorkbook = $ExportWorkbook.IsPresent + CombineOutput = $CombineOutput.IsPresent + # Auth (no secrets) + Auth = $Auth + TenantId = $TenantId + ClientId = $ClientId + # Other + ResultSize = $ResultSize + MaxConcurrency = $MaxConcurrency + UseEOM = $UseEOM.IsPresent + AutoCompleteness = $AutoCompleteness.IsPresent + IncludeTelemetry = $IncludeTelemetry.IsPresent + } + Initialize-CheckpointForNewRun -OutputPath $OutputPath -BaseOutputFileName $baseFileName -RunTimestamp $global:ScriptRunTimestamp -StartDate (script:Parse-DateSafe $StartDate) -EndDate (script:Parse-DateSafe $EndDate) -AllParameters $allParams +} +else { + # No checkpoint needed (AppRegistration mode or RAWInputCSV or OnlyUserInfo) + $script:FinalOutputPath = $OutputFile + $script:PartialOutputPath = $null +} + +# Update LogFile to match OutputFile base name (extension swapped to .log) +# Skip for resume mode - log file will be set after checkpoint is loaded +if (-not $script:DeferLogFileSetup) { + $logBaseName = [System.IO.Path]::GetFileNameWithoutExtension($OutputFile) + $logDir = Split-Path $OutputFile -Parent + $script:LogFile = Join-Path $logDir 
("{0}.log" -f $logBaseName)
    $LogFile = $script:LogFile
}

# Note: $scriptMode already defined earlier for validation - reformat for display consistency
$scriptModeDisplay = if ($ExplodeDeep) { "Deep Column Explosion" } elseif ($ExplodeArrays -or $ForcedRawInputCsvExplosion) { if ($ForcedRawInputCsvExplosion -and -not $ExplodeArrays.IsPresent -and -not $ExplodeDeep.IsPresent) { "Array Explosion (RAWInput implied)" } else { "Array Explosion" } } else { "Standard (1:1)" }

# Clean display names — strip _PARTIAL suffix so the startup banner always shows the expected final filename
$displayOutputFile = $OutputFile -replace '_PARTIAL(?=\.[^.]+$)', ''
$displayLogFile = $LogFile -replace '_PARTIAL(?=\.log$)', ''

# Skip banner output to log file in resume mode (log file not set yet - will be set after checkpoint loads)
if (-not $script:DeferLogFileSetup) {
@"
=== Portable Audit eXporter (PAX) - Purview Audit Log Exporter ===
Script Start Time (UTC): $((Get-Date).ToUniversalTime().ToString('yyyy-MM-dd HH:mm:ss')) UTC
Script Version: v$ScriptVersion
Mode: $scriptModeDisplay
Date Range: $(if ($RAWInputCSV) { if ([string]::IsNullOrWhiteSpace($StartDate) -and [string]::IsNullOrWhiteSpace($EndDate)) { 'Full CSV (no date filter)' } else { "$StartDate (inclusive) to $EndDate (exclusive) (filters)" } } else { "$StartDate (inclusive) to $EndDate (exclusive)" })
Output File: $displayOutputFile
Log File: $displayLogFile
========================================================

"@ | Out-File -FilePath $LogFile -Encoding UTF8

    # Flush buffered logs now that the log file is finalized.
    # This must run AFTER the banner: Out-File above (no -Append) recreates the file, so
    # entries appended before it would be truncated away.
    # It must also stay inside this non-resume branch: in resume mode the log file is not
    # set yet, so the buffer is kept intact here rather than being written nowhere and cleared.
    if ($script:LogBuffer -and $script:LogBuffer.Count -gt 0) {
        foreach ($entry in $script:LogBuffer) {
            try { Add-Content -Path $script:LogFile -Value $entry -Encoding UTF8 -ErrorAction SilentlyContinue } catch {}
        }
        $script:LogBuffer.Clear()
    }
}

Write-LogHost "=== Portable Audit eXporter (PAX) - Purview Audit Log Exporter 
===" -ForegroundColor Cyan +Write-LogHost ("Script Version: v$ScriptVersion") -ForegroundColor White + + +# Fast-path: ensure M365 usage bundle is applied before output summary in raw/replay scenarios +if ($IncludeM365Usage -and -not ($PSBoundParameters.ContainsKey('ActivityTypes'))) { + $ActivityTypes = @($m365UsageActivityBundle + $copilotBaseActivityType) | Select-Object -Unique + # Activity types will be displayed in "Activity Types for This Run" section +} + +# Display active mode (Replay, EOM, or Graph API) +Write-LogHost "" +if ($RAWInputCSV) { + Write-LogHost "═══════════════════════════════════════════════════════" -ForegroundColor Cyan + Write-LogHost " REPLAY MODE: Offline CSV (no service connections)" -ForegroundColor Cyan + Write-LogHost "═══════════════════════════════════════════════════════" -ForegroundColor Cyan + Write-LogHost " Source: $RAWInputCSV" -ForegroundColor White + Write-LogHost " Explosion: $scriptModeDisplay" -ForegroundColor White + Write-LogHost "═══════════════════════════════════════════════════════" -ForegroundColor Cyan +} +elseif ($UseEOM) { + Write-LogHost "═══════════════════════════════════════════════════════" -ForegroundColor Cyan + Write-LogHost " QUERY MODE: Exchange Online Management" -ForegroundColor Cyan + Write-LogHost "═══════════════════════════════════════════════════════" -ForegroundColor Cyan + Write-LogHost " API Method: Search-UnifiedAuditLog cmdlet" -ForegroundColor White + Write-LogHost " Module: ExchangeOnlineManagement" -ForegroundColor White + Write-LogHost " Authentication: $Auth" -ForegroundColor White + Write-LogHost " Parallel Support: DISABLED (serial-only processing)" -ForegroundColor Yellow + Write-LogHost " Permissions: Exchange Online RBAC roles required" -ForegroundColor White + Write-LogHost " (View-Only Audit Logs, Compliance Management)" -ForegroundColor Gray + Write-LogHost "═══════════════════════════════════════════════════════" -ForegroundColor Cyan +} +else { + Write-LogHost 
"═══════════════════════════════════════════════════════" -ForegroundColor Green + Write-LogHost " QUERY MODE: Microsoft Graph Security API (Default)" -ForegroundColor Green + Write-LogHost "═══════════════════════════════════════════════════════" -ForegroundColor Green + Write-LogHost " API Method: REST-based audit log queries" -ForegroundColor White + Write-LogHost " Module: Microsoft.Graph.Security" -ForegroundColor White + Write-LogHost " Authentication: $Auth (OAuth 2.0)" -ForegroundColor White + $parallelStatus = if ($PSVersionTable.PSVersion.Major -ge 7) { "AVAILABLE (PowerShell 7+)" } else { "LIMITED (PowerShell 5.1 detected)" } + Write-LogHost " Parallel Support: $parallelStatus" -ForegroundColor Green + Write-LogHost " Permissions: AuditLog.Read.All Graph API scope" -ForegroundColor White + Write-LogHost " + Azure AD role (Compliance/Security Admin)" -ForegroundColor Gray + Write-LogHost "═══════════════════════════════════════════════════════" -ForegroundColor Green +} +Write-LogHost "" + +$startTimeStamp = try { $script:metrics.StartTime.ToUniversalTime().ToString('yyyy-MM-dd HH:mm:ss') } catch { (Get-Date).ToUniversalTime().ToString('yyyy-MM-dd HH:mm:ss') } +Write-LogHost ("Script execution started at $startTimeStamp UTC") -ForegroundColor White + +# For OnlyUserInfo mode, show simplified header (no Mode/DateRange/Purview output info) +if (-not $OnlyUserInfo) { + Write-LogHost "Mode: $scriptMode" -ForegroundColor White + ${rangeText} = if ($RAWInputCSV) { if ([string]::IsNullOrWhiteSpace($StartDate) -and [string]::IsNullOrWhiteSpace($EndDate)) { 'Full CSV (no date filter)' } else { "$StartDate (inclusive) to $EndDate (exclusive) (filters)" } } else { "$StartDate (inclusive) to $EndDate (exclusive)" } + Write-LogHost "Date Range: $rangeText" -ForegroundColor White +} else { + Write-LogHost "Mode: OnlyUserInfo (Entra user and MAC licensing export only)" -ForegroundColor Cyan +} + +# --- Early build of $finalActivityTypes for user warning checks (before 
auth) --- +# This preview build is used ONLY for multi-output warning and PAYG billing warning +# The full/authoritative build happens later in the pipeline after authentication +$finalActivityTypes = @() +if ($PSBoundParameters.ContainsKey('ActivityTypes') -and $ActivityTypes) { + $finalActivityTypes += $ActivityTypes +} +if ($IncludeDSPMForAI) { + $finalActivityTypes += 'ConnectedAIAppInteraction' + $finalActivityTypes += 'AIInteraction' + $finalActivityTypes += 'AIAppInteraction' +} +if ($IncludeM365Usage) { + $finalActivityTypes += $m365UsageActivityBundle +} +# Add CopilotInteraction as default if no custom types and not excluded +$userProvidedCustomTypes = $PSBoundParameters.ContainsKey('ActivityTypes') +if (-not $ExcludeCopilotInteraction -and (-not $userProvidedCustomTypes -or $IncludeDSPMForAI)) { + if (-not ($finalActivityTypes -contains 'CopilotInteraction')) { + $finalActivityTypes += 'CopilotInteraction' + } +} +# Remove CopilotInteraction if explicitly excluded +if ($ExcludeCopilotInteraction) { + $finalActivityTypes = $finalActivityTypes | Where-Object { $_ -ne 'CopilotInteraction' } +} +$finalActivityTypes = $finalActivityTypes | Select-Object -Unique + +# --- Multi-Output Warning: Prompt when many files/tabs expected without -CombineOutput --- +$activityTypeCount = $finalActivityTypes.Count +$isMultiOutputScenario = ($activityTypeCount -gt 10) -and (-not $CombineOutput) +$outputType = if ($ExportWorkbook) { "tabs" } else { "CSV files" } + +if ($isMultiOutputScenario -and -not $Force) { + Write-LogHost "" + Write-LogHost "============================================================================================================" -ForegroundColor Yellow + Write-Host "WARNING: Multiple Output $outputType Detected" -ForegroundColor Yellow + Write-LogHost "============================================================================================================" -ForegroundColor Yellow + Write-LogHost "" + Write-LogHost "You have $activityTypeCount 
activity types selected." -ForegroundColor Cyan + Write-LogHost "Without -CombineOutput, this will create $activityTypeCount separate $outputType." -ForegroundColor Cyan + Write-LogHost "" + Write-LogHost "Recommendation:" -ForegroundColor Green + Write-LogHost " • Add -CombineOutput to merge all activity types into a single $(if ($ExportWorkbook) { 'tab' } else { 'CSV file' })" -ForegroundColor Green + Write-LogHost "" + Write-LogHost "Do you want to continue with $activityTypeCount separate $outputType?" -ForegroundColor White + Write-LogHost "" + Write-LogHost " [Y] YES - Continue with separate $outputType (I understand there will be many $outputType)" -ForegroundColor Green + Write-LogHost " [C] COMBINE - Enable -CombineOutput and continue (single merged $(if ($ExportWorkbook) { 'tab' } else { 'CSV' }))" -ForegroundColor Cyan + Write-LogHost " [E] EXIT - Cancel script execution" -ForegroundColor Red + Write-LogHost "" + + Send-PromptNotification + $multiOutput_choice = Read-Host "Enter your choice (Y/C/E)" + + if ($multiOutput_choice -eq 'Y' -or $multiOutput_choice -eq 'y') { + Write-LogHost "" + Write-LogHost "Continuing with $activityTypeCount separate output files..." -ForegroundColor Green + Write-LogHost "" + } + elseif ($multiOutput_choice -eq 'C' -or $multiOutput_choice -eq 'c') { + Write-LogHost "" + Write-LogHost "ENABLED: -CombineOutput mode" -ForegroundColor Green + Write-LogHost " All $activityTypeCount activity types will be merged into a single CSV file." 
-ForegroundColor Cyan + Write-LogHost "" + $CombineOutput = $true + } + else { + Write-LogHost "" + Write-LogHost "User choice: EXIT - Script execution cancelled" -ForegroundColor Red + Write-LogHost "" + exit 0 + } +} +elseif ($isMultiOutputScenario -and $Force) { + Write-LogHost "Force mode: Skipping multi-output warning ($activityTypeCount activity types, separate $outputType)" -ForegroundColor DarkGray + Write-LogHost " → Continuing with separate $outputType (use -CombineOutput to merge if desired)" -ForegroundColor DarkGray +} +# --- End Multi-Output Warning --- + +# --- DSPM for AI: Billing Information Warning --- +if (($finalActivityTypes -contains 'AIAppInteraction') -or ($finalActivityTypes -contains 'ConnectedAIAppInteraction') -or ($finalActivityTypes -contains 'AIInteraction')) { + if (-not $Force) { + Write-LogHost "" + Write-LogHost "============================================================================================================" -ForegroundColor Yellow + Write-Host "INFORMATION: DSPM for AI Audit Logging - Billing Details" -ForegroundColor Cyan + Write-LogHost "============================================================================================================" -ForegroundColor Yellow + Write-LogHost "" + Write-LogHost "DSPM Activity Types:" -ForegroundColor Cyan + Write-LogHost " • AIInteraction - FREE (Microsoft platforms: Copilot Studio, Azure AI Studio)" -ForegroundColor Green + Write-LogHost " • ConnectedAIAppInteraction - MIXED (FREE for Microsoft apps, PAYG for third-party)" -ForegroundColor Yellow + if ($finalActivityTypes -contains 'AIAppInteraction') { + Write-LogHost " • AIAppInteraction - PAYG BILLING REQUIRED (third-party AI like ChatGPT)" -ForegroundColor DarkYellow + } + Write-LogHost "" + # Check if AIAppInteraction is included - offer options + if ($finalActivityTypes -contains 'AIAppInteraction') { + Write-LogHost "[!] 
IMPORTANT: AIAppInteraction REQUIRES Microsoft Purview PAYG billing" -ForegroundColor Yellow + Write-LogHost "" + Write-LogHost "PAYG Requirements:" -ForegroundColor Cyan + Write-LogHost " • Azure subscription linked to M365 tenant" -ForegroundColor Cyan + Write-LogHost " • Microsoft Purview PAYG billing enabled in Compliance portal" -ForegroundColor Cyan + Write-LogHost "" + Write-LogHost "Do you have PAYG billing configured in your tenant?" -ForegroundColor White + Write-LogHost "" + Write-LogHost " [Y] YES - I have PAYG billing, continue with all DSPM types" -ForegroundColor Green + Write-LogHost " [N] NO - I don't have PAYG billing, remove AIAppInteraction and continue" -ForegroundColor Yellow + Write-LogHost " (Third-party AI records will NOT be included)" -ForegroundColor DarkGray + Write-LogHost " [E] EXIT - Cancel script execution" -ForegroundColor Red + Write-LogHost "" + + Send-PromptNotification + $payg_choice = Read-Host "Enter your choice (Y/N/E)" + + if ($payg_choice -eq 'Y' -or $payg_choice -eq 'y') { + Write-LogHost "" + Write-LogHost "Continuing with all DSPM types (AIInteraction, ConnectedAIAppInteraction, AIAppInteraction)..." -ForegroundColor Green + Write-LogHost "" + } + elseif ($payg_choice -eq 'N' -or $payg_choice -eq 'n') { + Write-LogHost "" + Write-LogHost "REMOVED: AIAppInteraction (third-party AI records will NOT be captured)" -ForegroundColor Yellow + Write-LogHost "Continuing with: AIInteraction, ConnectedAIAppInteraction (Microsoft platforms only)" -ForegroundColor Green + Write-LogHost "" + Write-LogHost "Note: Without PAYG billing, only Microsoft-hosted AI activity will be captured." -ForegroundColor Yellow + Write-LogHost " Third-party AI apps (ChatGPT, etc.) require PAYG billing." 
-ForegroundColor Yellow + Write-LogHost "" + + # Set flag to remove AIAppInteraction during later rebuild + $script:RemoveAIAppInteraction = $true + $finalActivityTypes = $finalActivityTypes | Where-Object { $_ -ne 'AIAppInteraction' } + } + else { + Write-LogHost "" + Write-LogHost "User choice: EXIT - Script execution cancelled" -ForegroundColor Red + Write-LogHost "" + exit 0 + } + } + else { + # No AIAppInteraction - simple Y/N prompt + Write-LogHost "Note: AIAppInteraction (PAYG-only third-party AI) is NOT included." -ForegroundColor DarkGray + Write-LogHost " Only Microsoft-hosted AI activity will be captured (AIInteraction, ConnectedAIAppInteraction)." -ForegroundColor DarkGray + Write-LogHost "" + Send-PromptNotification + $payg_choice = Read-Host "Continue with DSPM for AI export? (Y/N)" + + if ($payg_choice -eq 'Y' -or $payg_choice -eq 'y') { + Write-LogHost "" + Write-LogHost "Continuing with DSPM for AI export..." -ForegroundColor Green + Write-LogHost "" + } + else { + Write-LogHost "" + Write-LogHost "User choice: ABORT - DSPM for AI export declined" -ForegroundColor Red + Write-LogHost "Script execution cancelled by user." 
-ForegroundColor Yellow + Write-LogHost "" + exit 0 + } + } + } + else { + Write-LogHost "Force mode enabled: Skipping DSPM for AI billing information prompt" -ForegroundColor DarkGray + Write-LogHost "User choice: CONTINUE (Force mode - automatic acceptance)" -ForegroundColor Gray + } +} +# --- End PAYG Billing Warning --- + +# Output file/directory display based on export mode +# Note: If activity type switches are used, detailed filenames will be shown after activity types are finalized +# For OnlyUserInfo mode, only show Entra file output (skip Purview data file messaging) +if ($OnlyUserInfo) { + $outputDir = if ($OutputPath) { $OutputPath } else { "C:\Temp\" } + if ($ExportWorkbook) { + $entraOutputFile = Join-Path $outputDir "EntraUsers_MAClicensing_${global:ScriptRunTimestamp}.xlsx" + Write-LogHost "Output File: $entraOutputFile (Entra users workbook)" -ForegroundColor White + } else { + $entraOutputFile = Join-Path $outputDir "EntraUsers_MAClicensing_${global:ScriptRunTimestamp}.csv" + Write-LogHost "Output File: $entraOutputFile" -ForegroundColor White + } +} elseif ($AppendFile) { + # AppendFile mode: Show exact filename being appended to + $fileType = if ($ExportWorkbook) { + if ($CombineOutput) { "single-tab workbook" } else { "multi-tab workbook" } + } else { + "CSV file" + } + Write-LogHost "Output File: $displayOutputFile ($fileType)" -ForegroundColor White + Write-LogHost " Mode: Appending to existing file" -ForegroundColor Cyan +} elseif ($IncludeDSPMForAI -or $ExcludeCopilotInteraction) { + # Activity type switches present - defer detailed filename listing until after activity types are finalized + $outputDir = if ($ExportWorkbook) { + if ($OutputPath) { $OutputPath } else { "C:\Temp\" } + } else { + Split-Path $OutputFile -Parent + } + Write-LogHost "Output Directory: $outputDir\" -ForegroundColor White + Write-LogHost " (Detailed filenames will be shown after activity types are finalized)" -ForegroundColor Gray +} elseif ($ExportWorkbook) { + # 
Excel mode: always one .xlsx file (combined tab or multiple tabs) + $outputDir = if ($OutputPath) { $OutputPath } else { "C:\Temp\" } + if ($CombineOutput) { + # New naming: Purview_Audit_CombinedUsageActivity[_EntraUsers]_timestamp.xlsx + $baseName = "Purview_Audit_CombinedUsageActivity" + if ($IncludeUserInfo -and -not $UseEOM) { $baseName += "_EntraUsers" } + Write-LogHost "Output File: ${outputDir}${baseName}_.xlsx (single-tab workbook)" -ForegroundColor White + } else { + Write-LogHost "Output File: ${outputDir}Purview_Audit_MultiTab_.xlsx (multi-tab workbook)" -ForegroundColor White + } +} else { + # CSV mode: combined file or separate files per activity type + if ($CombineOutput) { + # Single combined CSV file + Write-LogHost "Output File: $displayOutputFile (combined - all activity types)" -ForegroundColor White + if ($IncludeUserInfo -and -not $UseEOM) { + $entraFile = (Join-Path (Split-Path $OutputFile -Parent) "EntraUsers_MAClicensing_${global:ScriptRunTimestamp}.csv") + Write-LogHost " Entra Users File: $entraFile" -ForegroundColor Gray + } + } else { + # Separate CSV files per activity type + $outputDir = Split-Path $OutputFile -Parent + $timestamp = [System.IO.Path]::GetFileNameWithoutExtension($OutputFile) -replace '.*_(\d{8}_\d{6}).*', '$1' + Write-LogHost "Output Directory: $outputDir\" -ForegroundColor White + Write-LogHost "Output Files: ${outputDir}\Purview_Audit__${timestamp}.csv" -ForegroundColor Gray + if ($IncludeUserInfo -and -not $UseEOM) { + $entraFile = "${outputDir}\EntraUsers_MAClicensing_${global:ScriptRunTimestamp}.csv" + Write-LogHost " Entra Users: $entraFile" -ForegroundColor Gray + } + } +} + +Write-LogHost "Log File: $displayLogFile" -ForegroundColor White +if (-not $RAWInputCSV) { + Write-LogHost "Authentication: $Auth" -ForegroundColor White +} + +if ($AgentId -or $AgentsOnly -or $ExcludeAgents -or $PromptFilter -or $UserIds -or $GroupNames) { + Write-LogHost "Filters:" -ForegroundColor Yellow + if ($AgentsOnly) { 
Write-LogHost " AgentsOnly: Only records with AgentId present" -ForegroundColor Gray } + if ($AgentId) { + $agentDisplay = if ($AgentId.Count -eq 1) { + "Specific AgentId: $($AgentId[0])" + } + elseif ($AgentId.Count -le 3) { + "Specific AgentIds ($($AgentId.Count)): " + ($AgentId -join '; ') + } + else { + "Specific AgentIds ($($AgentId.Count) total):" + } + Write-LogHost " $agentDisplay" -ForegroundColor Gray + if ($AgentId.Count -gt 3) { + for ($i = 0; $i -lt [Math]::Min(3, $AgentId.Count); $i++) { + $displayId = if ($AgentId[$i].Length -gt 80) { $AgentId[$i].Substring(0, 77) + '...' } else { $AgentId[$i] } + Write-LogHost " [$($i+1)] $displayId" -ForegroundColor DarkGray + } + if ($AgentId.Count -gt 3) { + Write-LogHost " ... and $($AgentId.Count - 3) more" -ForegroundColor DarkGray + } + } + } + if ($ExcludeAgents) { Write-LogHost " ExcludeAgents: Only records without AgentId" -ForegroundColor Gray } + if ($PromptFilter) { + $promptLabel = switch ($PromptFilter) { + 'Prompt' { 'Only prompts (Message_isPrompt = True)' } + 'Response' { 'Only responses (Message_isPrompt = False)' } + 'Both' { 'Both prompts and responses (Message_isPrompt = True or False)' } + 'Null' { 'Only records with no Message_isPrompt values (Null/Empty)' } + } + Write-LogHost " PromptFilter: $promptLabel" -ForegroundColor Gray + } + if ($UserIds -or $GroupNames) { + if ($UserIds) { + if ($UserIds.Count -eq 1) { Write-LogHost " UserIds: 1 user" -ForegroundColor Gray } else { Write-LogHost " UserIds: $($UserIds.Count) users" -ForegroundColor Gray } + } + if ($GroupNames) { + if ($GroupNames.Count -eq 1) { Write-LogHost " GroupNames: 1 group" -ForegroundColor Gray } else { Write-LogHost " GroupNames: $($GroupNames.Count) groups" -ForegroundColor Gray } + } + } +} + +Write-LogHost "=============================================" -ForegroundColor Cyan +Write-LogHost "" + +# Now perform AppendFile validation if needed (after banner display) +if ($AppendFile) { + + $validation = 
Test-AppendFileCompatibility ` + -FilePath $OutputFile ` + -IsExcel $ExportWorkbook ` + -ExplodeArrays:$ExplodeArrays ` + -ExplodeDeep:$ExplodeDeep + + if (-not $validation.Compatible) { + Write-Host "" + Write-Host "════════════════════════════════════════════════════════════════════" -ForegroundColor Red + Write-Host " ERROR: Explosion Parameter Mismatch - Cannot Append" -ForegroundColor Red + Write-Host "════════════════════════════════════════════════════════════════════" -ForegroundColor Red + Write-Host "" + Write-Host "The existing file was created with different explosion parameters than" -ForegroundColor Yellow + Write-Host "the current command. Appending would create incompatible data structures." -ForegroundColor Yellow + Write-Host "" + Write-Host "Existing file: $OutputFile" -ForegroundColor White + Write-Host " Columns: $($validation.ExistingCount)" -ForegroundColor Gray + Write-Host " Mode: $($validation.ExistingMode.DisplayName)" -ForegroundColor Gray + Write-Host "" + Write-Host "Current command:" -ForegroundColor White + Write-Host " Mode: $($validation.CurrentMode.DisplayName)" -ForegroundColor Gray + Write-Host "" + Write-Host "Root Cause:" -ForegroundColor Cyan + Write-Host " Explosion parameters must match between original file and append operation." -ForegroundColor Yellow + Write-Host "" + Write-Host "Resolution Options:" -ForegroundColor Cyan + Write-Host " 1. Match the original file's parameters:" -ForegroundColor White + Write-Host " Use: $($validation.ExistingMode.DisplayName)" -ForegroundColor Gray + Write-Host " 2. Create new output file instead:" -ForegroundColor White + Write-Host " Remove -AppendFile parameter" -ForegroundColor Gray + + if ($ExportWorkbook) { + Write-Host "" + Write-Host "Note for Excel mode:" -ForegroundColor DarkGray + Write-Host " If parameters matched, mismatched columns would create timestamped" -ForegroundColor DarkGray + Write-Host " duplicate tabs instead of appending (no data loss)." 
-ForegroundColor DarkGray + } + else { + Write-Host "" + Write-Host "CRITICAL for CSV mode:" -ForegroundColor Yellow + Write-Host " CSV append with mismatched explosion parameters creates CORRUPTED files!" -ForegroundColor Yellow + Write-Host " This validation prevents data corruption by failing early." -ForegroundColor Yellow + } + + Write-Host "" + Write-Host "════════════════════════════════════════════════════════════════════" -ForegroundColor Red + exit 1 + } + + Write-LogHost " Explosion parameters compatible ($($validation.CurrentMode.DisplayName)) - safe to append" -ForegroundColor Green + + Write-LogHost "AppendFile mode: Appending to existing file: $OutputFile" -ForegroundColor Cyan +} + +Write-LogHost "=============================================" -ForegroundColor Cyan +Write-LogHost "" +if ($ExplodeDeep -and $ExplodeArrays) { Write-LogHost "Note: -ExplodeDeep takes precedence over -ExplodeArrays (arrays will still explode, plus deep flatten)." -ForegroundColor DarkYellow } +if ($ForcedRawInputCsvExplosion -and -not $ExplodeDeep -and -not $ExplodeArrays.IsPresent) { Write-LogHost "RAWInputCSV provided -> forcing Purview array explosion (non-exploded mode disabled)." -ForegroundColor Yellow } +if ($script:memoryFlushEnabled) { + $memSource = if ($MaxMemoryMB -eq -1) { "auto-detected" } else { "user-specified" } + Write-LogHost "Memory management: $($script:ResolvedMaxMemoryMB)MB limit ($memSource) - will flush to disk when exceeded" -ForegroundColor Cyan + Write-LogHost " Note: Not compatible with explosion modes (-ExplodeDeep/-ExplodeArrays) - those modes require in-memory processing." 
-ForegroundColor DarkGray +} elseif ($script:ResolvedMaxMemoryMB -gt 0 -and ($ExplodeDeep -or $ExplodeArrays -or $ForcedRawInputCsvExplosion)) { + Write-LogHost "Note: Memory limit ($($script:ResolvedMaxMemoryMB)MB) ignored because explosion mode is active" -ForegroundColor DarkYellow +} + +if ($RAWInputCSV) { + # Build snapshot then optionally inject EntraUsersOutput immediately after OutputFile + $paramSnapshot = [ordered]@{ + Mode = $scriptMode + RAWInputCSV = $RAWInputCSV + 'StartDate (inclusive)' = $StartDate + 'EndDate (exclusive)' = $EndDate + ActivityTypes = ($ActivityTypes -join ';') + ExcludeCopilotInteraction = $ExcludeCopilotInteraction.IsPresent + ExplodeArrays = $ForcedRawInputCsvExplosion + ExplodeDeep = $ExplodeDeep.IsPresent + UseEOM = $UseEOM.IsPresent + MaxMemoryMB = $(if ($script:ResolvedMaxMemoryMB -eq 0) { 'Off' } else { "$($script:ResolvedMaxMemoryMB)MB" + $(if ($MaxMemoryMB -eq -1) { ' (auto)' } else { '' }) }) + StatusIntervalSeconds = $StatusIntervalSeconds + MaxPartitions = $MaxPartitions + ResultSize = $ResultSize + PacingMs = $PacingMs + ExportWorkbook = $ExportWorkbook.IsPresent + CombineOutput = $CombineOutput.IsPresent + AppendFile = $(if ($AppendFile) { $AppendFile } else { '' }) + Force = $Force.IsPresent + SkipDiagnostics = $SkipDiagnostics.IsPresent + AutoCompleteness = $AutoCompleteness.IsPresent + EmitMetricsJson = $EmitMetricsJson.IsPresent + MetricsPath = $(if ($MetricsPath) { $MetricsPath } else { '' }) + StreamingSchemaSample = $StreamingSchemaSample + StreamingChunkSize = $StreamingChunkSize + OutputFile = $displayOutputFile + LogFile = $displayLogFile + PSVersion = $PSVersionTable.PSVersion.ToString() + PSEdition = $PSVersionTable.PSEdition + HostName = $Host.Name + HostVersion = $(try { $Host.Version.ToString() } catch { '' }) + } + $copilotIncluded = $IncludeCopilotInteraction.IsPresent -or ($ActivityTypes -contains $copilotBaseActivityType) + $paramSnapshot['IncludeCopilotInteraction'] = $copilotIncluded + if 
($IncludeUserInfo -and -not $UseEOM) { + $entraPath = if ($ExportWorkbook) { 'Workbook Tab: EntraUsers' } else { (Join-Path (Split-Path $OutputFile -Parent) "EntraUsers_MAClicensing_${global:ScriptRunTimestamp}.csv") } + # Rebuild ordered snapshot with EntraUsersOutput immediately after OutputFile + $newSnap = [ordered]@{} + foreach ($k in $paramSnapshot.Keys) { + $newSnap[$k] = $paramSnapshot[$k] + if ($k -eq 'OutputFile') { $newSnap['EntraUsersOutput'] = $entraPath } + } + $paramSnapshot = $newSnap + } +} +else { + # Smart parameter snapshot: Show only applicable parameters for the chosen query mode + $paramSnapshot = [ordered]@{ + 'StartDate (inclusive)' = $StartDate + 'EndDate (exclusive)' = $EndDate + OutputFile = $displayOutputFile + LogFile = $displayLogFile + } + + # Authentication (both modes, but different usage) + if ($UseEOM) { + # EOM mode: Auth parameter controls connection + $paramSnapshot['Auth'] = $Auth + } else { + # Graph API mode: Auth parameter controls connection + $paramSnapshot['Auth'] = $Auth + # Capture AppRegistration context (without exposing secrets) + $paramSnapshot['TenantId'] = $(if ($TenantId) { $TenantId } elseif ($env:GRAPH_TENANT_ID) { '[GRAPH_TENANT_ID]' } else { '' }) + $paramSnapshot['ClientId'] = $(if ($ClientId) { $ClientId } elseif ($env:GRAPH_CLIENT_ID) { '[GRAPH_CLIENT_ID]' } else { '' }) + if ($PSBoundParameters.ContainsKey('ClientSecret') -or $env:GRAPH_CLIENT_SECRET) { + $paramSnapshot['ClientSecret'] = '[securestring provided]' + } + if ($PSBoundParameters.ContainsKey('ClientCertificateThumbprint')) { + $paramSnapshot['ClientCertificateThumbprint'] = $ClientCertificateThumbprint + } + if ($PSBoundParameters.ContainsKey('ClientCertificateStoreLocation')) { + $paramSnapshot['ClientCertificateStoreLocation'] = $ClientCertificateStoreLocation + } + if ($PSBoundParameters.ContainsKey('ClientCertificatePath')) { + $paramSnapshot['ClientCertificatePath'] = $ClientCertificatePath + } + if 
($PSBoundParameters.ContainsKey('ClientCertificatePassword')) { + $paramSnapshot['ClientCertificatePassword'] = '[securestring provided]' + } + } + + # Query parameters specific to each mode + if ($UseEOM) { + # EOM-specific: Search-UnifiedAuditLog parameters + $paramSnapshot['BlockHours'] = $BlockHours + $paramSnapshot['MaxPartitions'] = $MaxPartitions + $paramSnapshot['ResultSize'] = $ResultSize + $paramSnapshot['PacingMs'] = $PacingMs + $paramSnapshot['StatusIntervalSeconds'] = $StatusIntervalSeconds + } else { + # Graph API-specific: Parallel processing parameters + $paramSnapshot['MaxConcurrency'] = $MaxConcurrency + $paramSnapshot['ParallelMode'] = $ParallelMode + $paramSnapshot['MaxParallelGroups'] = $MaxParallelGroups + $paramSnapshot['IncludeUserInfo'] = $IncludeUserInfo.IsPresent + $paramSnapshot['OnlyUserInfo'] = $OnlyUserInfo.IsPresent + $paramSnapshot['MaxNetworkOutageMinutes'] = $MaxNetworkOutageMinutes + $paramSnapshot['PartitionHours'] = if ($PartitionHours -gt 0) { $PartitionHours } else { 'auto' } + $paramSnapshot['MaxPartitions'] = $MaxPartitions + $paramSnapshot['ResultSize'] = $ResultSize + $paramSnapshot['PacingMs'] = $PacingMs + $paramSnapshot['MaxMemoryMB'] = $(if ($script:ResolvedMaxMemoryMB -eq 0) { 'Off' } else { "$($script:ResolvedMaxMemoryMB)MB" + $(if ($MaxMemoryMB -eq -1) { ' (auto)' } else { '' }) }) + $paramSnapshot['StatusIntervalSeconds'] = $StatusIntervalSeconds + } + + # Common toggles and output options + $paramSnapshot['UseEOM'] = $UseEOM.IsPresent + $paramSnapshot['ExportWorkbook'] = $ExportWorkbook.IsPresent + $paramSnapshot['CombineOutput'] = $CombineOutput.IsPresent + $paramSnapshot['AppendFile'] = $(if ($AppendFile) { $AppendFile } else { '' }) + $paramSnapshot['Force'] = $Force.IsPresent + $paramSnapshot['SkipDiagnostics'] = $SkipDiagnostics.IsPresent + $paramSnapshot['AutoCompleteness'] = $AutoCompleteness.IsPresent + $paramSnapshot['EmitMetricsJson'] = $EmitMetricsJson.IsPresent + $paramSnapshot['MetricsPath'] = $(if 
($MetricsPath) { $MetricsPath } else { '' }) + $paramSnapshot['StreamingSchemaSample'] = $StreamingSchemaSample + $paramSnapshot['StreamingChunkSize'] = $StreamingChunkSize + + # Common parameters (work in both modes) + $paramSnapshot['ActivityTypes'] = ($ActivityTypes -join ';') + $paramSnapshot['RecordTypes'] = $(if ($RecordTypes) { ($RecordTypes -join ';') } else { '' }) + $paramSnapshot['ServiceTypes'] = $(if ($ServiceTypes) { ($ServiceTypes -join ';') } else { '' }) + $copilotIncluded = $IncludeCopilotInteraction.IsPresent -or ($ActivityTypes -contains $copilotBaseActivityType) + $paramSnapshot['IncludeCopilotInteraction'] = $copilotIncluded + $paramSnapshot['IncludeM365Usage'] = $IncludeM365Usage.IsPresent + $paramSnapshot['IncludeDSPMForAI'] = $IncludeDSPMForAI.IsPresent + $paramSnapshot['ExcludeCopilotInteraction'] = $ExcludeCopilotInteraction.IsPresent + + # Post-processing filters (work in both modes - applied during/after explosion) + $paramSnapshot['AgentsOnly'] = $AgentsOnly.IsPresent + $paramSnapshot['AgentId'] = $(if ($AgentId) { ($AgentId -join ';') } else { '' }) + $paramSnapshot['ExcludeAgents'] = $ExcludeAgents.IsPresent + $paramSnapshot['UserId'] = $(if ($UserIds) { ($UserIds -join ';') } else { '' }) + + # GroupNames only works in live mode (requires auth for expansion) + if (-not $UseEOM -and $GroupNames) { + $paramSnapshot['GroupName'] = ($GroupNames -join ';') + } + + $paramSnapshot['PromptFilter'] = $(if ($PromptFilter) { $PromptFilter } else { '' }) + + # Output format parameters (work in both modes) + $paramSnapshot['ExplodeArrays'] = ($ExplodeArrays.IsPresent -or $ForcedRawInputCsvExplosion -or $ExplodeDeep.IsPresent) + $paramSnapshot['ExplodeDeep'] = $ExplodeDeep.IsPresent + $paramSnapshot['ExplosionThreads'] = $(if ($ExplosionThreads -eq 0) { 'auto' } else { $ExplosionThreads }) + + # Environment info + $paramSnapshot['PSVersion'] = $PSVersionTable.PSVersion.ToString() + $paramSnapshot['PSEdition'] = $PSVersionTable.PSEdition + 
$paramSnapshot['HostName'] = $Host.Name + $paramSnapshot['HostVersion'] = $(try { $Host.Version.ToString() } catch { '' }) + + # Entra users output reference (Graph mode only, inserted directly under OutputFile) + if ($IncludeUserInfo -and -not $UseEOM) { + $entraPath = if ($ExportWorkbook) { 'Workbook Tab: EntraUsers' } else { (Join-Path (Split-Path $OutputFile -Parent) "EntraUsers_MAClicensing_${global:ScriptRunTimestamp}.csv") } + # Rebuild snapshot to inject after OutputFile + $newSnap = [ordered]@{} + foreach ($k in $paramSnapshot.Keys) { + $newSnap[$k] = $paramSnapshot[$k] + if ($k -eq 'OutputFile') { $newSnap['EntraUsersOutput'] = $entraPath } + } + $paramSnapshot = $newSnap + } +} +# Parameter Snapshot will be displayed after DSPM processing (see line ~4150) + +# Predeclare script-scope collections to satisfy StrictMode before first access +if (-not (Get-Variable -Name DeepExtraColumns -Scope Script -ErrorAction SilentlyContinue)) { $script:DeepExtraColumns = $null } + +<# +===================================================================== + Operational Logic +===================================================================== +#> + +function Find-AllArrays { + param( + $Data, + [string]$Path = '', + [int]$Depth = 0, + [hashtable]$Arrays + ) + if ($null -eq $Data) { return @{} } + if (-not $Arrays) { $Arrays = @{} } + if ($Depth -gt 6) { return $Arrays } + if ($null -eq $Data) { return $Arrays } + + $isArray = ($Data -is [System.Collections.IEnumerable] -and -not ($Data -is [string]) -and (($Data -is [System.Collections.IList]) -or $Data.GetType().IsArray)) + if ($isArray) { + $key = if ($Path) { $Path } else { 'root' } + if (-not $Arrays.ContainsKey($key)) { + $Arrays[$key] = [pscustomobject]@{ Path = $Path; Data = $Data; Count = ($Data | Measure-Object).Count } + } + } + + $props = $null + if ($Data -is [System.Management.Automation.PSObject]) { $props = $Data.PSObject.Properties } + elseif ($Data -is [System.Collections.IDictionary]) { $props = 
$Data.GetEnumerator() } + + if ($props) { + foreach ($p in $props) { + $name = if ($p -is [System.Collections.DictionaryEntry]) { $p.Key } else { $p.Name } + $val = if ($p -is [System.Collections.DictionaryEntry]) { $p.Value } else { $p.Value } + $childPath = if ($Path) { "$Path.$name" } else { $name } + Find-AllArrays -Data $val -Path $childPath -Depth ($Depth + 1) -Arrays $Arrays | Out-Null + } + } + # Note: Do NOT recurse into array elements - arrays are treated as terminal values + # that will be converted to JSON strings for predictable column names + return $Arrays +} + +function Test-ScalarValue { param($v) ($null -eq $v -or $v -is [string] -or $v -is [char] -or $v -is [bool] -or $v -is [int] -or $v -is [long] -or $v -is [double] -or $v -is [decimal] -or $v -is [float] -or $v -is [datetime] -or $v -is [guid]) } + +function Import-CsvToDataTable { + <# + .SYNOPSIS + Imports a CSV file directly into a System.Data.DataTable using fast .NET StreamReader. + + .DESCRIPTION + This is 10-50x faster than Import-Csv | ConvertTo-DataTable for large files because it: + 1. Uses .NET StreamReader instead of PowerShell's Import-Csv + 2. Avoids creating intermediate PSObjects + 3. Parses CSV directly into DataTable rows + + .PARAMETER Path + The path to the CSV file to import. 
+ + .OUTPUTS + System.Data.DataTable + #> + param( + [Parameter(Mandatory = $true)] + [string]$Path + ) + + $dataTable = New-Object System.Data.DataTable + $reader = $null + + try { + $reader = New-Object System.IO.StreamReader($Path, [System.Text.Encoding]::UTF8) + $lineNum = 0 + $columns = @() + + while ($null -ne ($line = $reader.ReadLine())) { + # Parse CSV line (handles quoted fields with commas) + $fields = [System.Collections.Generic.List[string]]::new() + $field = [System.Text.StringBuilder]::new() + $inQuotes = $false + + for ($i = 0; $i -lt $line.Length; $i++) { + $c = $line[$i] + if ($c -eq '"') { + if ($inQuotes -and $i + 1 -lt $line.Length -and $line[$i + 1] -eq '"') { + [void]$field.Append('"') + $i++ + } else { + $inQuotes = -not $inQuotes + } + } elseif ($c -eq ',' -and -not $inQuotes) { + [void]$fields.Add($field.ToString()) + [void]$field.Clear() + } else { + [void]$field.Append($c) + } + } + [void]$fields.Add($field.ToString()) + + if ($lineNum -eq 0) { + # Header row - create columns + $columns = $fields.ToArray() + foreach ($col in $columns) { + [void]$dataTable.Columns.Add($col, [string]) + } + } else { + # Data row + $row = $dataTable.NewRow() + for ($j = 0; $j -lt [Math]::Min($columns.Count, $fields.Count); $j++) { + $val = $fields[$j] + $row[$j] = if ([string]::IsNullOrEmpty($val)) { [DBNull]::Value } else { $val } + } + [void]$dataTable.Rows.Add($row) + } + $lineNum++ + } + } + finally { + if ($reader) { $reader.Dispose() } + } + + return ,$dataTable +} + +function ConvertTo-DataTable { + <# + .SYNOPSIS + Converts an array of PSObjects to a System.Data.DataTable for high-performance Excel export. + + .DESCRIPTION + Export-Excel with piped PSObjects processes cells one-by-one (~400 cells/sec), which is extremely + slow for large datasets. Send-SQLDataToExcel with DataTable uses bulk insert and is 100-1000x faster. + This function converts PSObject arrays to DataTable format for use with Send-SQLDataToExcel. 
+ + .PARAMETER InputObject + The array of PSObjects to convert to a DataTable. + + .OUTPUTS + System.Data.DataTable + #> + param( + [Parameter(Mandatory = $true, ValueFromPipeline = $true)] + [object[]]$InputObject + ) + + begin { + $dataTable = New-Object System.Data.DataTable + $isFirstRow = $true + $columns = @() + } + + process { + foreach ($obj in $InputObject) { + if ($isFirstRow) { + $columns = @($obj.PSObject.Properties.Name) + foreach ($colName in $columns) { + [void]$dataTable.Columns.Add($colName, [string]) + } + $isFirstRow = $false + } + + $row = $dataTable.NewRow() + foreach ($colName in $columns) { + $val = $obj.$colName + $row[$colName] = if ($null -eq $val) { [DBNull]::Value } else { [string]$val } + } + [void]$dataTable.Rows.Add($row) + } + } + + end { + return ,$dataTable + } +} + +function Export-DataTableToExcel { + <# + .SYNOPSIS + High-performance Excel export using DataTable bulk insert method. + + .DESCRIPTION + Wrapper function that converts PSObjects to DataTable and exports using Send-SQLDataToExcel. + This is 100-1000x faster than piping to Export-Excel for large datasets. + + .PARAMETER Data + The array of PSObjects to export. + + .PARAMETER Path + The path to the Excel file. + + .PARAMETER WorksheetName + The name of the worksheet/tab. 
+ #> + param( + [Parameter(Mandatory = $true)] + [object[]]$Data, + + [Parameter(Mandatory = $true)] + [string]$Path, + + [Parameter(Mandatory = $true)] + [string]$WorksheetName + ) + + $dataTable = $Data | ConvertTo-DataTable + Send-SQLDataToExcel -DataTable $dataTable -Path $Path -WorkSheetName $WorksheetName -Force -FreezeTopRow -BoldTopRow -AutoSize -NoNumberConversion '*' +} + +function ConvertTo-UniqueString { + param([object]$items, [char]$Sep = ';') + if ($null -eq $items) { return $null } + $set = New-Object System.Collections.Generic.HashSet[string] + foreach ($v in $items) { if ($null -ne $v -and $v -ne '') { [void]$set.Add([string]$v) } } + ([string]::Join($Sep, $set)) +} + +function ConvertTo-FlatColumns { + param([object]$Node, [string]$Prefix = '', [int]$MaxDepth = 60) + $cols = @{} + function Recurse([object]$n, [string]$p, [int]$d) { + if ($d -gt $MaxDepth) { return } + if ($null -eq $n) { if ($p) { $cols[$p.TrimEnd('.')] = $null }; return } + if (Test-ScalarValue $n) { if ($p) { $cols[$p.TrimEnd('.')] = $n }; return } + if ($n -is [System.Collections.IEnumerable] -and -not ($n -is [string]) -and -not ($n -is [System.Collections.IDictionary])) { + # Smart array handling: single-element arrays recurse without index, multi-element become JSON + $arr = @($n) + if ($arr.Count -eq 1) { + # Single element: recurse into it without adding index to path (clean column names) + Recurse -n $arr[0] -p $p -d ($d + 1) + } elseif ($arr.Count -gt 1) { + # Multiple elements: serialize to JSON (row explosion handles important arrays separately) + if ($p) { + try { $cols[$p.TrimEnd('.')] = ($n | ConvertTo-Json -Depth 10 -Compress -ErrorAction SilentlyContinue) } + catch { $cols[$p.TrimEnd('.')] = '' } + } + } else { + # Empty array + if ($p) { $cols[$p.TrimEnd('.')] = '' } + } + return + } + $props = $null; try { $props = $n.PSObject.Properties } catch {} + if ($props) { + foreach ($prop in $props) { $name = [string]$prop.Name; $child = $prop.Value; $cp = if ($p) { $p 
+ $name + '.' } else { $name + '.' }; Recurse -n $child -p $cp -d ($d + 1) } + } + } + Recurse -n $Node -p $Prefix -d 0 + return $cols +} + +function To-RecordArray { + param($records) + $result = @() + if ($null -eq $records) { return $result } + $isEnumerable = ($records -is [System.Collections.IEnumerable]) + $isScalarish = ($records -is [string] -or $records -is [System.Management.Automation.PSObject] -or $records -is [System.Management.Automation.PSCustomObject]) + if ($isEnumerable -and -not $isScalarish) { + foreach ($r in $records) { $result += ,$r } + } + else { + $result += ,$records + } + return $result +} + +try { + $scriptDir = Split-Path -Parent $MyInvocation.MyCommand.Path + $schemaHelper = Join-Path $scriptDir "..\..\scripts\lib\M365UsageSchema.ps1" + if (-not (Test-Path $schemaHelper)) { $schemaHelper = Join-Path $scriptDir "M365UsageSchema.ps1" } + if (Test-Path $schemaHelper) { . $schemaHelper } +} catch {} + +function Invoke-ReplayInlineExport { + param( + [Parameter(Mandatory)] [System.Collections.IEnumerable]$Logs + ) + Write-LogHost "Replay inline export starting..." 
-ForegroundColor Magenta + $exportTemp = Join-Path ([System.IO.Path]::GetTempPath()) ("pax_export_" + [guid]::NewGuid().ToString() + ".tmp") + # Fixed 153-column M code schema (matches live explosion output exactly) + $columnOrder = $PurviewExplodedHeader + Open-CsvWriter -Path $exportTemp -Columns $columnOrder + $total = 0 + $idx = 0 + $errCount = 0 + $errLimit = 25 + foreach ($log in $Logs) { + $idx++ + if ($idx % 5000 -eq 0) { Write-LogHost ("Replay inline progress: {0} records" -f $idx) -ForegroundColor DarkGray } + try { + $records = Convert-ToPurviewExplodedRecords -Record $log -Deep:$ExplodeDeep -PromptFilterValue $PromptFilter + $recordsArr = To-RecordArray $records + if ($recordsArr.Count -gt 0) { + $total += $recordsArr.Count + $emitSet = $recordsArr | ForEach-Object { $_ | Select-Object -Property $columnOrder } + $rowsOut = @($emitSet) + if ($rowsOut.Count -gt 0) { Write-CsvRows -Rows $rowsOut -Columns $columnOrder } + } + } catch { + $errCount++ + } + } + try { Close-CsvWriter } catch {} + try { Move-Item -Force -Path $exportTemp -Destination $OutputFile } catch {} + try { $script:metrics.TotalStructuredRows = $total } catch {} + Write-LogHost ("Replay inline export complete: {0} rows" -f $total) -ForegroundColor Green + + # Explosion summary for replay mode + Write-LogHost "" + Write-LogHost "=== REPLAY EXPLOSION SUMMARY ===" -ForegroundColor Cyan + Write-LogHost (" Input records: {0:N0}" -f $idx) -ForegroundColor White + Write-LogHost (" Output rows: {0:N0}" -f $total) -ForegroundColor White + if ($total -gt $idx) { + $explosionRatio = [Math]::Round($total / $idx, 2) + Write-LogHost (" Expansion: {0}x ({1:N0} additional rows from array explosion)" -f $explosionRatio, ($total - $idx)) -ForegroundColor Green + } elseif ($total -eq $idx) { + Write-LogHost " Expansion: 1:1 (no arrays exploded)" -ForegroundColor Yellow + } else { + Write-LogHost (" Reduction: {0:N0} records filtered out" -f ($idx - $total)) -ForegroundColor DarkYellow + } + if ($errCount 
-gt 0) { + Write-LogHost (" Errors: {0:N0} record(s) failed to process" -f $errCount) -ForegroundColor Red + } + Write-LogHost (" Output file: {0}" -f $OutputFile) -ForegroundColor Gray + Write-LogHost "" +} + +function Get-SafeProperty { param($obj, [string]$name) try { if ($null -ne $obj -and $obj.PSObject.Properties[$name]) { return $obj.($name) } } catch {}; return $null } + +# --- Purview Exploded Schema (153 columns — matches M code #"Changed Type" step exactly) --- +$PurviewExplodedHeader = @( + 'RecordId', 'CreationDate', 'RecordType', 'Operation', 'UserId', + 'AssociatedAdminUnits', 'AssociatedAdminUnitsNames', + '@odata.type', 'CreationTime', 'Id', 'OrganizationId', + 'ResultStatus', 'UserKey', 'UserType', 'Version', 'Workload', + 'ClientIP', 'ObjectId', 'AzureActiveDirectoryEventType', + 'ActorContextId', 'ActorIpAddress', 'InterSystemsId', 'IntraSystemId', + 'SupportTicketId', 'TargetContextId', 'ApplicationId', + 'DeviceProperties.OS', 'DeviceProperties.BrowserType', + 'ErrorNumber', + 'SiteUrl', 'SourceRelativeUrl', 'SourceFileName', 'SourceFileExtension', + 'ListId', 'ListItemUniqueId', 'WebId', 'ApplicationDisplayName', 'EventSource', + 'ItemType', 'SiteSensitivityLabelId', 'GeoLocation', 'IsManagedDevice', + 'DeviceDisplayName', 'ListBaseType', 'ListServerTemplate', + 'AuthenticationType', 'Site', 'DoNotDistributeEvent', 'HighPriorityMediaProcessing', + 'BrowserName', 'BrowserVersion', 'CorrelationId', 'Platform', 'UserAgent', + 'ActorInfoString', 'AppId', 'AuthType', 'ClientAppId', 'ClientIPAddress', + 'ClientInfoString', 'ExternalAccess', 'InternalLogonType', 'LogonType', + 'LogonUserSid', 'MailboxGuid', 'MailboxOwnerSid', 'MailboxOwnerUPN', + 'OrganizationName', 'OriginatingServer', 'SessionId', + 'TokenObjectId', 'TokenTenantId', 'TokenType', 'SaveToSentItems', + 'OperationCount', 'FileSizeBytes', + 'MeetingId', 'MeetingType', 'EventSignature', 'EventData', + 'Permission', 'SensitivityLabelId', 'SharingLinkScope', + 'TargetUserOrGroupType', 
'TargetUserOrGroupName', + 'MeetingURL', 'ChatId', 'MessageId', 'MessageSizeInBytes', 'MessageType', + 'FormId', 'FormName', 'VideoId', 'VideoName', 'ChannelId', 'ViewDuration', + 'ClientRegion', 'CopilotLogVersion', 'TargetId', + 'TeamName', 'TeamGuid', 'ResponseId', 'IsAnonymous', 'DeviceType', + 'ChannelName', 'ChannelGuid', 'ChannelType', 'AppName', 'EnvironmentName', + 'PlanId', 'PlanName', 'TaskId', 'TaskName', 'PercentComplete', + 'CrossMailboxOperation', + 'RecordTypeNum', 'ResultStatus_Audit', + 'ModelId', 'ModelProvider', 'ModelFamily', + 'TokensTotal', 'TokensInput', 'TokensOutput', 'DurationMs', 'OutcomeStatus', + 'ConversationId', 'TurnNumber', 'RetryCount', 'ClientVersion', 'ClientPlatform', + 'AgentId', 'AgentName', 'AgentVersion', 'AgentCategory', 'ApplicationName', + 'AppHost', 'ThreadId', + 'Context_Id', 'Context_Type', + 'Message_Id', 'Message_isPrompt', + 'AccessedResource_Action', 'AccessedResource_PolicyDetails', 'AccessedResource_SiteUrl', + 'AISystemPlugin_Id', 'AISystemPlugin_Name', + 'ModelTransparencyDetails_ModelName', 'MessageIds', + 'AccessedResource_Name', 'AccessedResource_SensitivityLabel', + 'AccessedResource_ResourceType', 'SensitivityLabel', 'Context_Item' +) + +# --- M365 Usage Base Header --- +$M365UsageBaseHeader = @( + 
'RecordId','CreationDate','RecordType','Operation','UserId','AuditData','AssociatedAdminUnits','AssociatedAdminUnitsNames','CreationTime','Id','OrganizationId','ResultStatus','UserKey','UserType','Version','Workload','ClientIP','ObjectId','AzureActiveDirectoryEventType','ExtendedProperties','ExtendedProperties.ResultStatusDetail','ExtendedProperties.Name','ExtendedProperties.Value','ExtendedProperties.UserAgent','ExtendedProperties.RequestType','ModifiedProperties','Actor','Actor.ID','Actor.Type','ActorContextId','ActorIpAddress','InterSystemsId','IntraSystemId','SupportTicketId','Target','Target.ID','Target.Type','TargetContextId','ApplicationId','DeviceProperties','DeviceProperties.OS','DeviceProperties.Name','DeviceProperties.Value','DeviceProperties.BrowserType','DeviceProperties.SessionId','ErrorNumber','ExtendedProperties.KeepMeSignedIn','DeviceProperties.Id','DeviceProperties.DisplayName','DeviceProperties.TrustType','ExtendedProperties.UserAuthenticationMethod','DeviceProperties.IsCompliant','DeviceProperties.IsCompliantAndManaged', + # SharePoint / OneDrive + 'SiteUrl','SourceRelativeUrl','SourceFileName','SourceFileExtension','ListId','ListItemUniqueId','WebId','ApplicationDisplayName','EventSource','ItemType','SiteSensitivityLabelId','GeoLocation','IsManagedDevice','DeviceDisplayName','ListBaseType','ListServerTemplate','AuthenticationType','Site','DoNotDistributeEvent','HighPriorityMediaProcessing', + # App Access Context + 'AppAccessContext.ClientAppId','AppAccessContext.ClientAppName','AppAccessContext.CorrelationId','AppAccessContext.AADSessionId','AppAccessContext.UniqueTokenId','AppAccessContext.AuthTime','AppAccessContext.TokenIssuedAtTime','AppAccessContext.UserObjectId','AppAccessContext.DeviceId' +) + +# --- Unified Replay Header (auto-detects all activity types) --- +# Scans input CSV to detect columns from any record type, merges with PurviewExplodedHeader for Copilot +# Skips CopilotEventData.* paths since explosion produces flat column 
# names
function Get-UnifiedReplayHeader {
    <#
    .SYNOPSIS
    Builds the ordered, de-duplicated column header used when replaying a raw audit CSV.
    .DESCRIPTION
    Columns are emitted in this order, each name only once (first occurrence wins,
    case-sensitive comparison):
      1. the fixed base columns,
      2. the fixed augmented columns (SharePoint / OneDrive and AppAccessContext.*),
      3. property paths discovered in the AuditData JSON of the first -Sample rows of the CSV
         (CopilotEventData is skipped; ExtendedProperties / DeviceProperties Name/Value arrays
         are pivoted into "<Parent>.<Name>" columns),
      4. every entry of $PurviewExplodedHeader.
    Rows whose AuditData is not valid JSON are ignored. When the path is empty or the file does
    not exist, discovery is skipped and only the fixed columns plus $PurviewExplodedHeader are
    returned.
    Side effect: when a path is supplied the header is also written, best effort, to
    'UnifiedReplayHeader.txt' in the parent folder of that path.
    .PARAMETER RawCsvPath
    Path of the raw audit CSV to sample. An empty string is accepted so the empty-path guards in
    the body (and the legacy Get-M365UsageWideHeader wrapper called without a path) work.
    .PARAMETER Sample
    Maximum number of CSV rows inspected for column discovery (default 500).
    .OUTPUTS
    The column names, in header order.
    #>
    param(
        [Parameter(Mandatory)][AllowEmptyString()][string]$RawCsvPath,
        [int]$Sample = 500
    )
    # Base columns common to all activity types
    $base = @('RecordId','CreationDate','RecordType','Operation','UserId','AuditData','AssociatedAdminUnits','AssociatedAdminUnitsNames','CreationTime','Id','OrganizationId','ResultStatus','UserKey','UserType','Version','Workload','ClientIP','ObjectId','AzureActiveDirectoryEventType','ExtendedProperties','ExtendedProperties.ResultStatusDetail','ExtendedProperties.Name','ExtendedProperties.Value','ExtendedProperties.UserAgent','ExtendedProperties.RequestType','ModifiedProperties','Actor','Actor.ID','Actor.Type','ActorContextId','ActorIpAddress','InterSystemsId','IntraSystemId','SupportTicketId','Target','Target.ID','Target.Type','TargetContextId','ApplicationId','DeviceProperties','DeviceProperties.OS','DeviceProperties.Name','DeviceProperties.Value','DeviceProperties.BrowserType','DeviceProperties.SessionId','ErrorNumber','ExtendedProperties.KeepMeSignedIn','DeviceProperties.Id','DeviceProperties.DisplayName','DeviceProperties.TrustType','ExtendedProperties.UserAuthenticationMethod','DeviceProperties.IsCompliant','DeviceProperties.IsCompliantAndManaged')
    # Augmented columns: SharePoint / OneDrive first, then AppAccessContext.*
    $aug = @(
        'SiteUrl','SourceRelativeUrl','SourceFileName','SourceFileExtension','ListId','ListItemUniqueId','WebId','ApplicationDisplayName','EventSource','ItemType','SiteSensitivityLabelId','GeoLocation','IsManagedDevice','DeviceDisplayName','ListBaseType','ListServerTemplate','AuthenticationType','Site','DoNotDistributeEvent','HighPriorityMediaProcessing',
        'AppAccessContext.ClientAppId','AppAccessContext.ClientAppName','AppAccessContext.CorrelationId','AppAccessContext.AADSessionId','AppAccessContext.UniqueTokenId','AppAccessContext.AuthTime','AppAccessContext.TokenIssuedAtTime','AppAccessContext.UserObjectId','AppAccessContext.DeviceId','AppAccessContext.@odata.type','AppAccessContext.APIId','AppAccessContext.IssuedAtTime'
    )
    $detected = New-Object System.Collections.Generic.List[string]

    # Recursively detect column paths from JSON, skipping CopilotEventData (handled by explosion with flat names)
    function Add-Paths([object]$node, [string]$prefix, [System.Collections.Generic.List[string]]$collector) {
        if ($null -eq $node) { return }
        # Scalar leaf: record the dotted path that led here (nothing to record at the root)
        if (Test-ScalarValue $node) { if ($prefix) { $collector.Add($prefix) }; return }
        # Arrays contribute the paths of their elements under the same prefix
        if ($node -is [System.Collections.IEnumerable] -and $node -isnot [string]) {
            foreach ($item in $node) { Add-Paths $item $prefix $collector }
            return
        }
        if ($node.PSObject -and $node.PSObject.Properties) {
            foreach ($prop in $node.PSObject.Properties) {
                $pn = $prop.Name; $pv = $prop.Value
                $path = if ($prefix) { "$prefix.$pn" } else { $pn }
                # SKIP CopilotEventData - explosion handles these with flat column names
                if ($pn -eq 'CopilotEventData') { continue }
                # Special handling for Name/Value arrays (pivot into columns).
                # The canonical parent name is used so the emitted column casing never varies.
                $nvParent = $null
                if ($pn -eq 'ExtendedProperties') { $nvParent = 'ExtendedProperties' }
                elseif ($pn -eq 'DeviceProperties') { $nvParent = 'DeviceProperties' }
                if ($nvParent -and $pv -is [System.Collections.IEnumerable]) {
                    foreach ($item in $pv) { try { if ($item.Name) { $collector.Add("$nvParent.$($item.Name)") } } catch {} }
                    continue
                }
                Add-Paths $pv $path $collector
            }
        }
    }

    if ($RawCsvPath -and (Test-Path $RawCsvPath)) {
        try {
            $rows = Import-Csv $RawCsvPath | Select-Object -First $Sample
            foreach ($r in $rows) {
                try {
                    $audit = $r.AuditData | ConvertFrom-Json -ErrorAction Stop
                    if ($audit) { Add-Paths $audit '' $detected }
                } catch {
                    # Deliberately ignored: rows whose AuditData is not parseable JSON add no columns
                }
            }
        } catch {
            # Best effort: discovery failure still yields the fixed header
            Write-Verbose ("Get-UnifiedReplayHeader: column discovery skipped for '{0}': {1}" -f $RawCsvPath, $_.Exception.Message)
        }
    }

    # Build unified header: base + augmented + detected (non-Copilot) + PurviewExplodedHeader (flat Copilot columns)
    # A HashSet tracks what has been emitted so de-duplication stays linear; its default comparer
    # is ordinal, the same comparison List[string].Contains performs.
    $header = New-Object System.Collections.Generic.List[string]
    $seen = New-Object System.Collections.Generic.HashSet[string]
    foreach ($c in $base) { if ($seen.Add($c)) { $header.Add($c) } }
    foreach ($c in $aug) { if ($seen.Add($c)) { $header.Add($c) } }
    foreach ($c in $detected) { if ($seen.Add($c)) { $header.Add($c) } }
    # Always include flat Copilot columns from PurviewExplodedHeader (supports all activity types)
    foreach ($c in $PurviewExplodedHeader) { if ($seen.Add($c)) { $header.Add($c) } }

    try {
        if ($RawCsvPath) {
            $hdrPath = Join-Path (Split-Path $RawCsvPath -Parent) 'UnifiedReplayHeader.txt'
            $header | Set-Content -Path $hdrPath -Encoding utf8
        }
    } catch {
        # Best effort: the side-car header file is a convenience, never a reason to fail
        Write-Verbose ("Get-UnifiedReplayHeader: could not write header file: {0}" -f $_.Exception.Message)
    }
    return $header
}

# --- Legacy M365 Usage Wide Header (kept for backward compatibility) ---
# Same parameters as Get-UnifiedReplayHeader except that -RawCsvPath is optional here.
function Get-M365UsageWideHeader {
    param(
        [string]$RawCsvPath,
        [int]$Sample = 500
    )
    # Delegate to unified header function
    return Get-UnifiedReplayHeader -RawCsvPath $RawCsvPath -Sample $Sample
}

# --- Entra Users Schema (47 columns) ---
# 30 core + 5 manager + 2 license columns + 10 Power BI template compatibility columns
$EntraUsersHeader = @(
    'userPrincipalName','DisplayName','id','Email','givenName','surname','JobTitle','department','employeeType','employeeId','employeeHireDate',
    'officeLocation','city','state','Country','postalCode','companyName','employeeOrgData_division','employeeOrgData_costCenter',
    'accountEnabled','userType','createdDateTime','usageLocation','preferredLanguage','onPremisesSyncEnabled','onPremisesImmutableId','externalUserState',
    'proxyAddresses_Primary','proxyAddresses_Count','proxyAddresses_All',
    'manager_id','manager_displayName','manager_userPrincipalName','manager_mail','manager_jobTitle',
    'assignedLicenses','HasLicense',
    # Power BI template compatibility columns (alias mappings)
    'ManagerID','BusinessAreaLabel','CountryofEmployment','CompanyCodeLabel','CostCentreLabel','UserName',
    # Power BI template compatibility columns (null placeholders for Viva Insights fields)
    'EffectiveDate','FunctionType','BusinessAreaCode','OrgLevel_3Label'
)

function Test-EntraUsersSchema {
    <#
    .SYNOPSIS
    Compares the property names of the first user object against $EntraUsersHeader.
    .DESCRIPTION
    Only $Users[0] is inspected. A warning listing missing and extra column names is logged via
    Write-LogHost when the sets differ; otherwise a confirmation is logged unless -Quiet is set.
    Name comparison is case-insensitive (-notin). Nothing is returned.
    .PARAMETER Users
    User objects to validate. $null and empty collections are accepted and result in a no-op;
    without the Allow* attributes a Mandatory parameter rejects both at binding time, which made
    the empty-input guard below unreachable.
    .PARAMETER Quiet
    Suppresses the success message (mismatch warnings are always logged).
    #>
    param(
        [Parameter(Mandatory=$true)][AllowNull()][AllowEmptyCollection()][array]$Users,
        [switch]$Quiet
    )
    if (-not $Users -or $Users.Count -eq 0) { return }
    $expected = $EntraUsersHeader
    $actual = $Users[0].PSObject.Properties.Name
    $missing = @($expected | Where-Object { $_ -notin $actual })
    $extra = @($actual | Where-Object { $_ -notin $expected })
    if ($missing.Count -gt 0 -or $extra.Count -gt 0) {
        Write-LogHost ("WARNING: EntraUsers schema mismatch. Missing: {0}; Extra: {1}" -f ($missing -join ', '), ($extra -join ', ')) -ForegroundColor Yellow
    } elseif (-not $Quiet) {
        Write-LogHost "Validated EntraUsers schema ($($expected.Count) columns)." -ForegroundColor DarkGray
    }
}

# ═══════════════════════════════════════════════════════════════════════════════
# FAST ROW CREATION HELPER
# Converts a hashtable to PSCustomObject in a single operation (avoids Add-Member overhead)
# Used by explosion logic to build rows efficiently via hashtable accumulation
# ═══════════════════════════════════════════════════════════════════════════════
function New-FastRow {
    <#
    .SYNOPSIS
    Creates a PSCustomObject from a hashtable in a single operation.
    .DESCRIPTION
    Builds a row by accumulating properties in a hashtable first, then converting
    to PSCustomObject once. This is significantly faster than repeated Add-Member calls.
    .PARAMETER Properties
    A hashtable containing property names and values for the new object.
    .EXAMPLE
    $props = @{ Name = 'Test'; Value = 123 }
    $row = New-FastRow -Properties $props
    #>
    [CmdletBinding()]
    param([Parameter(Mandatory)][hashtable]$Properties)
    return [PSCustomObject]$Properties
}

# Create the script-scoped DeepExtraColumns list unless a non-empty one already exists
$existingDeep = Get-Variable -Name DeepExtraColumns -Scope Script -ErrorAction SilentlyContinue
if (-not $existingDeep -or -not $script:DeepExtraColumns) { $script:DeepExtraColumns = New-Object System.Collections.Generic.List[string] }

function Convert-ToPurviewExplodedRecords {
    [CmdletBinding()]
    param(
        [Parameter(Mandatory)] $Record,
        [switch]$Deep,
        [switch]$PartialExplode, # NEW: Prompt-specific explosion only (preserves AuditData)
        [string]$PromptFilterValue,
        [switch]$SkipMetrics # Used by parallel replay to defer metrics aggregation to parent thread
    )
    try {
        $auditData = if ($Record.PSObject.Properties['_ParsedAuditData']) { $Record._ParsedAuditData } else { try { $Record.AuditData | ConvertFrom-Json -ErrorAction Stop } catch { $null } }
        if (-not $auditData) {
            if (-not $SkipMetrics) {
                $script:metrics.FilteringSkippedRecords++
                $script:metrics.FilteringMissingAuditData++
            }
            return @()
        }
        try { Profile-AuditData $auditData } catch {}
        # Ensure helper is available when re-entering session (PS alias scoping)
        if (-not (Get-Command Find-AllArrays -ErrorAction SilentlyContinue)) { Set-Alias -Name Find-AllArrays -Value Find-AllArrays -ErrorAction SilentlyContinue | Out-Null }

        $ced = Get-SafeProperty $auditData 'CopilotEventData'
        if (-not $ced) {
            # ── M code-aligned non-Copilot path: fixed 153-column extraction (no dynamic discovery) ──
            # Produces exactly 1 row per record with all 153 M code columns populated from AuditData.
            # No array explosion for non-Copilot records (matches M code behaviour).
            # DeviceProperties NV-pivot: only .OS and .BrowserType (matches M code GetNVProp).
+ $recordId = if ($Record.RecordId) { $Record.RecordId } elseif ($Record.Identity) { $Record.Identity } elseif ($Record.Id) { $Record.Id } else { $auditData.Id } + $creationDate = script:Format-DatePurviewFast $Record.CreationDate + $creationTime = try { script:Format-DatePurviewFast $auditData.CreationTime } catch { '' } + $opValue = try { $auditData.Operation } catch { if ($Record.Operation) { $Record.Operation } else { $Record.Operations } } + $uidValue = try { $auditData.UserId } catch { if ($Record.UserId) { $Record.UserId } elseif ($Record.UserIds) { $Record.UserIds } else { '' } } + $recordType = $Record.RecordType + $resultStatus = Get-SafeProperty $auditData 'ResultStatus' + $recordTypeNum = try { [int]$recordType } catch { $recordType } + $applicationId = Select-FirstNonNull -Values @((Get-SafeProperty $auditData 'ApplicationId'), (Get-SafeProperty $auditData 'AppId'), (Get-SafeProperty $auditData 'ClientAppId')) + # DeviceProperties NV-pivot: only .OS and .BrowserType (matches M code GetNVProp) + $devProps = Get-SafeProperty $auditData 'DeviceProperties' + $dpOS = ''; $dpBrowser = '' + if ($devProps -and ($devProps -is [System.Collections.IEnumerable])) { + foreach ($dp in $devProps) { + try { + if ($dp.Name -eq 'OS') { $dpOS = $dp.Value } + elseif ($dp.Name -eq 'BrowserType') { $dpBrowser = $dp.Value } + } catch {} + } + } + # AgentCategory + $agentIdVal = Get-SafeProperty $auditData 'AgentId' + $agentCat = '' + if ($agentIdVal) { + if ($agentIdVal -like "CopilotStudio.Declarative.*") { $agentCat = "Declarative Agent" } + elseif ($agentIdVal -like "CopilotStudio.CustomEngine.*") { $agentCat = "Custom Engine Agent" } + elseif ($agentIdVal -like "P_*") { $agentCat = "Declarative Agent (Purview)" } + else { $agentCat = "Other Agent" } + } + $rowObj = [PSCustomObject][ordered]@{ + RecordId = $recordId + CreationDate = $creationDate + RecordType = $recordType + Operation = $opValue + UserId = $uidValue + AssociatedAdminUnits = $(try { if 
($Record.AssociatedAdminUnits) { $Record.AssociatedAdminUnits } elseif ($auditData.AssociatedAdminUnits) { $auditData.AssociatedAdminUnits } else { '' } } catch { '' }) + AssociatedAdminUnitsNames = $(try { if ($Record.AssociatedAdminUnitsNames) { $Record.AssociatedAdminUnitsNames } elseif ($auditData.AssociatedAdminUnitsNames) { $auditData.AssociatedAdminUnitsNames } else { '' } } catch { '' }) + '@odata.type' = (Get-SafeProperty $auditData '@odata.type') + CreationTime = $creationTime + Id = (Get-SafeProperty $auditData 'Id') + OrganizationId = (Get-SafeProperty $auditData 'OrganizationId') + ResultStatus = $resultStatus + UserKey = (Get-SafeProperty $auditData 'UserKey') + UserType = (Get-SafeProperty $auditData 'UserType') + Version = (Get-SafeProperty $auditData 'Version') + Workload = (Get-SafeProperty $auditData 'Workload') + ClientIP = (Get-SafeProperty $auditData 'ClientIP') + ObjectId = (Get-SafeProperty $auditData 'ObjectId') + AzureActiveDirectoryEventType = (Get-SafeProperty $auditData 'AzureActiveDirectoryEventType') + ActorContextId = (Get-SafeProperty $auditData 'ActorContextId') + ActorIpAddress = (Get-SafeProperty $auditData 'ActorIpAddress') + InterSystemsId = (Get-SafeProperty $auditData 'InterSystemsId') + IntraSystemId = (Get-SafeProperty $auditData 'IntraSystemId') + SupportTicketId = (Get-SafeProperty $auditData 'SupportTicketId') + TargetContextId = (Get-SafeProperty $auditData 'TargetContextId') + ApplicationId = $applicationId + 'DeviceProperties.OS' = $dpOS + 'DeviceProperties.BrowserType' = $dpBrowser + ErrorNumber = (Get-SafeProperty $auditData 'ErrorNumber') + SiteUrl = (Get-SafeProperty $auditData 'SiteUrl') + SourceRelativeUrl = (Get-SafeProperty $auditData 'SourceRelativeUrl') + SourceFileName = (Get-SafeProperty $auditData 'SourceFileName') + SourceFileExtension = (Get-SafeProperty $auditData 'SourceFileExtension') + ListId = (Get-SafeProperty $auditData 'ListId') + ListItemUniqueId = (Get-SafeProperty $auditData 
'ListItemUniqueId') + WebId = (Get-SafeProperty $auditData 'WebId') + ApplicationDisplayName = (Get-SafeProperty $auditData 'ApplicationDisplayName') + EventSource = (Get-SafeProperty $auditData 'EventSource') + ItemType = (Get-SafeProperty $auditData 'ItemType') + SiteSensitivityLabelId = (Get-SafeProperty $auditData 'SiteSensitivityLabelId') + GeoLocation = (Get-SafeProperty $auditData 'GeoLocation') + IsManagedDevice = (Get-SafeProperty $auditData 'IsManagedDevice') + DeviceDisplayName = (Get-SafeProperty $auditData 'DeviceDisplayName') + ListBaseType = (Get-SafeProperty $auditData 'ListBaseType') + ListServerTemplate = (Get-SafeProperty $auditData 'ListServerTemplate') + AuthenticationType = (Get-SafeProperty $auditData 'AuthenticationType') + Site = (Get-SafeProperty $auditData 'Site') + DoNotDistributeEvent = (Get-SafeProperty $auditData 'DoNotDistributeEvent') + HighPriorityMediaProcessing = (Get-SafeProperty $auditData 'HighPriorityMediaProcessing') + BrowserName = (Get-SafeProperty $auditData 'BrowserName') + BrowserVersion = (Get-SafeProperty $auditData 'BrowserVersion') + CorrelationId = (Get-SafeProperty $auditData 'CorrelationId') + Platform = (Get-SafeProperty $auditData 'Platform') + UserAgent = (Get-SafeProperty $auditData 'UserAgent') + ActorInfoString = (Get-SafeProperty $auditData 'ActorInfoString') + AppId = (Get-SafeProperty $auditData 'AppId') + AuthType = (Get-SafeProperty $auditData 'AuthType') + ClientAppId = (Get-SafeProperty $auditData 'ClientAppId') + ClientIPAddress = (Get-SafeProperty $auditData 'ClientIPAddress') + ClientInfoString = (Get-SafeProperty $auditData 'ClientInfoString') + ExternalAccess = (Get-SafeProperty $auditData 'ExternalAccess') + InternalLogonType = (Get-SafeProperty $auditData 'InternalLogonType') + LogonType = (Get-SafeProperty $auditData 'LogonType') + LogonUserSid = (Get-SafeProperty $auditData 'LogonUserSid') + MailboxGuid = (Get-SafeProperty $auditData 'MailboxGuid') + MailboxOwnerSid = (Get-SafeProperty 
$auditData 'MailboxOwnerSid') + MailboxOwnerUPN = (Get-SafeProperty $auditData 'MailboxOwnerUPN') + OrganizationName = (Get-SafeProperty $auditData 'OrganizationName') + OriginatingServer = (Get-SafeProperty $auditData 'OriginatingServer') + SessionId = (Get-SafeProperty $auditData 'SessionId') + TokenObjectId = (Get-SafeProperty $auditData 'TokenObjectId') + TokenTenantId = (Get-SafeProperty $auditData 'TokenTenantId') + TokenType = (Get-SafeProperty $auditData 'TokenType') + SaveToSentItems = (Get-SafeProperty $auditData 'SaveToSentItems') + OperationCount = (Get-SafeProperty $auditData 'OperationCount') + FileSizeBytes = (Get-SafeProperty $auditData 'FileSizeBytes') + MeetingId = (Get-SafeProperty $auditData 'MeetingId') + MeetingType = (Get-SafeProperty $auditData 'MeetingType') + EventSignature = (Get-SafeProperty $auditData 'EventSignature') + EventData = (Get-SafeProperty $auditData 'EventData') + Permission = (Get-SafeProperty $auditData 'Permission') + SensitivityLabelId = (Get-SafeProperty $auditData 'SensitivityLabelId') + SharingLinkScope = (Get-SafeProperty $auditData 'SharingLinkScope') + TargetUserOrGroupType = (Get-SafeProperty $auditData 'TargetUserOrGroupType') + TargetUserOrGroupName = (Get-SafeProperty $auditData 'TargetUserOrGroupName') + MeetingURL = (Get-SafeProperty $auditData 'MeetingURL') + ChatId = (Get-SafeProperty $auditData 'ChatId') + MessageId = (Get-SafeProperty $auditData 'MessageId') + MessageSizeInBytes = (Get-SafeProperty $auditData 'MessageSizeInBytes') + MessageType = (Get-SafeProperty $auditData 'MessageType') + FormId = (Get-SafeProperty $auditData 'FormId') + FormName = (Get-SafeProperty $auditData 'FormName') + VideoId = (Get-SafeProperty $auditData 'VideoId') + VideoName = (Get-SafeProperty $auditData 'VideoName') + ChannelId = (Get-SafeProperty $auditData 'ChannelId') + ViewDuration = (Get-SafeProperty $auditData 'ViewDuration') + ClientRegion = (Get-SafeProperty $auditData 'ClientRegion') + CopilotLogVersion = 
(Get-SafeProperty $auditData 'CopilotLogVersion') + TargetId = (Get-SafeProperty $auditData 'TargetId') + TeamName = (Get-SafeProperty $auditData 'TeamName') + TeamGuid = (Get-SafeProperty $auditData 'TeamGuid') + ResponseId = (Get-SafeProperty $auditData 'ResponseId') + IsAnonymous = (Get-SafeProperty $auditData 'IsAnonymous') + DeviceType = (Get-SafeProperty $auditData 'DeviceType') + ChannelName = (Get-SafeProperty $auditData 'ChannelName') + ChannelGuid = (Get-SafeProperty $auditData 'ChannelGuid') + ChannelType = (Get-SafeProperty $auditData 'ChannelType') + AppName = (Get-SafeProperty $auditData 'AppName') + EnvironmentName = (Get-SafeProperty $auditData 'EnvironmentName') + PlanId = (Get-SafeProperty $auditData 'PlanId') + PlanName = (Get-SafeProperty $auditData 'PlanName') + TaskId = (Get-SafeProperty $auditData 'TaskId') + TaskName = (Get-SafeProperty $auditData 'TaskName') + PercentComplete = (Get-SafeProperty $auditData 'PercentComplete') + CrossMailboxOperation = (Get-SafeProperty $auditData 'CrossMailboxOperation') + RecordTypeNum = $recordTypeNum + ResultStatus_Audit = $resultStatus + ModelId = (Get-SafeProperty $auditData 'ModelId') + ModelProvider = (Get-SafeProperty $auditData 'ModelProvider') + ModelFamily = (Get-SafeProperty $auditData 'ModelFamily') + TokensTotal = (Get-SafeProperty $auditData 'TokensTotal') + TokensInput = (Get-SafeProperty $auditData 'TokensInput') + TokensOutput = (Get-SafeProperty $auditData 'TokensOutput') + DurationMs = (Get-SafeProperty $auditData 'DurationMs') + OutcomeStatus = (Get-SafeProperty $auditData 'OutcomeStatus') + ConversationId = (Get-SafeProperty $auditData 'ConversationId') + TurnNumber = (Get-SafeProperty $auditData 'TurnNumber') + RetryCount = (Get-SafeProperty $auditData 'RetryCount') + ClientVersion = (Get-SafeProperty $auditData 'ClientVersion') + ClientPlatform = (Get-SafeProperty $auditData 'ClientPlatform') + AgentId = $agentIdVal + AgentName = (Get-SafeProperty $auditData 'AgentName') + 
AgentVersion = (Get-SafeProperty $auditData 'AgentVersion') + AgentCategory = $agentCat + ApplicationName = (Get-SafeProperty $auditData 'ApplicationName') + SensitivityLabel = (Get-SafeProperty $auditData 'SensitivityLabel') + # CED sub-fields — empty for non-Copilot records + AppHost = '' + ThreadId = '' + Context_Id = '' + Context_Type = '' + Message_Id = '' + Message_isPrompt = '' + AccessedResource_Action = '' + AccessedResource_PolicyDetails = '' + AccessedResource_SiteUrl = '' + AISystemPlugin_Id = '' + AISystemPlugin_Name = '' + ModelTransparencyDetails_ModelName = '' + MessageIds = '' + AccessedResource_Name = '' + AccessedResource_SensitivityLabel = '' + AccessedResource_ResourceType = '' + Context_Item = '' + } + if ($Deep) { + # Deep flatten entire AuditData for each row (no raw JSON) + $flatAudit = ConvertTo-FlatColumns -Node $auditData -Prefix '' -MaxDepth $FlatDepthDeep + foreach ($k in $flatAudit.Keys) { if (-not $rowObj.PSObject.Properties[$k]) { Add-Member -InputObject $rowObj -NotePropertyName $k -NotePropertyValue $flatAudit[$k] -Force } } + } + return @($rowObj) + } + $messages = script:GetArrayFast $ced 'Messages' + if ($PromptFilterValue) { + $filteredMessages = New-Object System.Collections.Generic.List[object] + if ($PromptFilterValue -eq 'Null') { + foreach ($msg in $messages) { if ($null -eq $msg.isPrompt) { $filteredMessages.Add($msg) } } + } + elseif ($PromptFilterValue -eq 'Both') { + foreach ($msg in $messages) { if ($null -ne $msg.isPrompt) { $filteredMessages.Add($msg) } } + } + else { + $targetValue = ($PromptFilterValue -eq 'Prompt') + foreach ($msg in $messages) { try { if ($msg.isPrompt -eq $targetValue) { $filteredMessages.Add($msg) } } catch {} } + } + $messages = $filteredMessages + if ($messages.Count -eq 0) { + if (-not $SkipMetrics) { + $script:metrics.FilteringSkippedRecords++ + $script:metrics.FilteringPromptFiltered++ + } + return @() + } + } + $contexts = script:GetArrayFast $ced 'Contexts' + $resources = 
script:GetArrayFast $ced 'AccessedResources' + $pluginsRaw = script:GetArrayFast $ced 'AISystemPlugin' + $modelDetRaw = script:GetArrayFast $ced 'ModelTransparencyDetails' + $messageIds = script:GetArrayFast $ced 'MessageIds' + + # DSPM for AI: Extract SensitivityLabels array + $sensitivityLabels = script:GetArrayFast $ced 'SensitivityLabels' + + # DSPM for AI: Determine activity type for conditional 2-level explosion + $activityType = try { $auditData.Operation } catch { $null } + + # DSPM for AI: Extract 2nd-level arrays (for full explosion mode) + $plugins = $null + $recordingSessions = $null + $contextItems = $null + + if (-not $PartialExplode) { + # Full explosion mode: Extract 2nd-level arrays for row count calculation + if ($activityType -eq 'ConnectedAIAppInteraction' -and $appIdentityRaw) { + $plugins = script:GetArrayFast $appIdentityRaw 'Plugins' + } + if ($activityType -eq 'CopilotInteraction' -and $contexts.Count -gt 0) { + # Find max Items[] count across all Contexts + $maxItemsCount = 0 + foreach ($ctx in $contexts) { + if ($ctx) { + $items = script:GetArrayFast $ctx 'Items' + if ($items -and $items.Count -gt $maxItemsCount) { + $maxItemsCount = $items.Count + } + } + } + if ($maxItemsCount -gt 0) { + $contextItems = $maxItemsCount # Store count for row calculation + } + } + } + + if ($PromptFilterValue) { $rowCount = [Math]::Max(1, $messages.Count) } else { + # DSPM for AI: Include all arrays in row count calculation (including AISystemPlugin and ModelTransparencyDetails) + $arrayCounts = @(1, $messages.Count, $contexts.Count, $resources.Count, $sensitivityLabels.Count, $pluginsRaw.Count, $modelDetRaw.Count) + + # Full explosion: include 2nd-level arrays in row count + if (-not $PartialExplode) { + if ($plugins) { $arrayCounts += $plugins.Count } + if ($recordingSessions) { $arrayCounts += $recordingSessions.Count } + if ($contextItems) { $arrayCounts += $contextItems } + } + + $rowCount = ($arrayCounts | Measure-Object -Maximum).Maximum + } + # 
Removed $plugin0 and $model0 - now using indexed access in row loop for full explosion + $creationDate = script:Format-DatePurviewFast $Record.CreationDate + $creationTime = try { script:Format-DatePurviewFast $auditData.CreationTime } catch { '' } + $appIdentityRaw = (Select-FirstNonNull -Values @((Get-SafeProperty $auditData 'AppIdentity'), (Get-SafeProperty $ced 'AppIdentity'))) + $applicationId = Select-FirstNonNull -Values @((Get-SafeProperty $auditData 'ApplicationId'), (Get-SafeProperty $auditData 'AppId'), (Get-SafeProperty $auditData 'ClientAppId')) + # DeviceProperties NV-pivot: only .OS and .BrowserType (matches M code GetNVProp) + $devProps = Get-SafeProperty $auditData 'DeviceProperties' + $dpOS = ''; $dpBrowser = '' + if ($devProps -and ($devProps -is [System.Collections.IEnumerable])) { + foreach ($dp in $devProps) { + try { + if ($dp.Name -eq 'OS') { $dpOS = $dp.Value } + elseif ($dp.Name -eq 'BrowserType') { $dpBrowser = $dp.Value } + } catch {} + } + } + $appHost = (Select-FirstNonNull -Values @((Get-SafeProperty $ced 'AppHost'), (Get-SafeProperty $auditData 'AppHost'), (Get-SafeProperty $auditData 'Workload'))) + $clientRegion = (Get-SafeProperty $auditData 'ClientRegion') + $agentId = (Get-SafeProperty $auditData 'AgentId') + $agentName = (Get-SafeProperty $auditData 'AgentName') + $agentVersion = (Select-FirstNonNull -Values @((Get-SafeProperty $auditData 'AgentVersion'), (Get-SafeProperty $ced 'AgentVersion'), (Get-SafeProperty $ced 'Version'))) + + # Agent categorization based on AgentId pattern + $agentCategory = "" + if ($agentId) { + if ($agentId -like "CopilotStudio.Declarative.*") { + $agentCategory = "Declarative Agent" + } elseif ($agentId -like "CopilotStudio.CustomEngine.*") { + $agentCategory = "Custom Engine Agent" + } elseif ($agentId -like "P_*") { + $agentCategory = "Declarative Agent (Purview)" + } elseif ($agentId) { + $agentCategory = "Other Agent" + } + } + + # With -IncludeUserInfo, license data now appears only in 
EntraUsers output + # (or in combined mode via left join) + + $appName = (Select-FirstNonNull -Values @((Get-SafeProperty $auditData 'ApplicationName'), (Get-SafeProperty $ced 'HostAppName'), (Get-SafeProperty $ced 'ClientAppName'))) + $threadId = (Get-SafeProperty $ced 'ThreadId') + $auditUserKey = try { $auditData.UserKey } catch { $null } + # $modelName moved to row loop for indexed access + $clientIP = (Get-SafeProperty $auditData 'ClientIP') + $organizationId = (Get-SafeProperty $auditData 'OrganizationId') + $version = (Get-SafeProperty $auditData 'Version') + $userType = (Get-SafeProperty $auditData 'UserType') + $copilotLogVersion = (Get-SafeProperty $auditData 'CopilotLogVersion') + $workload = (Get-SafeProperty $auditData 'Workload') + + # Extract fields to match ExplodeArrays output for Power BI compatibility + $auditDataId = try { $auditData.Id } catch { $null } + $recordTypeNum = try { $auditData.RecordType } catch { $null } + $resultStatusAudit = try { $auditData.ResultStatus } catch { $null } + $appId = try { $auditData.AppId } catch { $null } + $clientAppId = try { $auditData.ClientAppId } catch { $null } + $correlationId = try { $auditData.CorrelationId } catch { $null } + + # Model and token fields (same as ExplodeArrays) + $modelId = Select-FirstNonNull -Values @((Get-SafeProperty $ced 'ModelId'), (Get-SafeProperty $ced 'ModelID'), (Get-SafeProperty $auditData 'ModelId')) + $modelProvider = Select-FirstNonNull -Values @((Get-SafeProperty $ced 'ModelProvider'), (Get-SafeProperty $ced 'Provider'), (Get-SafeProperty $ced 'ModelVendor')) + $modelFamily = Select-FirstNonNull -Values @((Get-SafeProperty $ced 'ModelFamily'), (Get-SafeProperty $ced 'ModelType')) + $usageNode = Select-FirstNonNull -Values @((Get-SafeProperty $ced 'Usage'), (Get-SafeProperty $ced 'TokenUsage'), (Get-SafeProperty $ced 'Tokens'), (Get-SafeProperty $auditData 'Usage')) + $tokensTotal = $null; $tokensInput = $null; $tokensOutput = $null + if ($usageNode) { + function 
Local:Get-Num([object]$v) { if ($null -eq $v) { return $null }; try { if ($v -is [string] -and [string]::IsNullOrWhiteSpace($v)) { return $null }; return [double]$v } catch { return $null } } + $tokensTotal = Local:Get-Num (Select-FirstNonNull -Values @((Get-SafeProperty $usageNode 'Total'), (Get-SafeProperty $usageNode 'TotalTokens'), (Get-SafeProperty $usageNode 'TokensTotal'))) + $tokensInput = Local:Get-Num (Select-FirstNonNull -Values @((Get-SafeProperty $usageNode 'Input'), (Get-SafeProperty $usageNode 'Prompt'), (Get-SafeProperty $usageNode 'InputTokens'), (Get-SafeProperty $usageNode 'TokensInput'))) + $tokensOutput = Local:Get-Num (Select-FirstNonNull -Values @((Get-SafeProperty $usageNode 'Output'), (Get-SafeProperty $usageNode 'Completion'), (Get-SafeProperty $usageNode 'OutputTokens'), (Get-SafeProperty $usageNode 'TokensOutput'))) + } + if (-not $tokensTotal -and ($tokensInput -or $tokensOutput)) { try { $tokensTotal = ($tokensInput + $tokensOutput) } catch {} } + + # Duration, outcome, conversation fields (same as ExplodeArrays) + function Local:Get-NumSafe([object]$v) { if ($null -eq $v) { return $null }; try { if ($v -is [string] -and [string]::IsNullOrWhiteSpace($v)) { return $null }; return [double]$v } catch { return $null } } + $durationMs = Local:Get-NumSafe (Select-FirstNonNull -Values @((Get-SafeProperty $ced 'DurationMs'), (Get-SafeProperty $ced 'ElapsedMs'), (Get-SafeProperty $ced 'ProcessingTimeMs'), (Get-SafeProperty $ced 'LatencyMs'))) + $outcomeStatus = Select-FirstNonNull -Values @((Get-SafeProperty $ced 'OutcomeStatus'), (Get-SafeProperty $ced 'Outcome'), (Get-SafeProperty $ced 'Result'), (Get-SafeProperty $ced 'Status')) + if ($outcomeStatus -is [bool]) { $outcomeStatus = if ($outcomeStatus) { 'Success' } else { 'Failure' } } + $conversationId = Select-FirstNonNull -Values @((Get-SafeProperty $ced 'ConversationId'), (Get-SafeProperty $ced 'ConversationID'), (Get-SafeProperty $ced 'SessionId')) + $turnNumber = Local:Get-NumSafe 
(Select-FirstNonNull -Values @((Get-SafeProperty $ced 'TurnNumber'), (Get-SafeProperty $ced 'TurnIndex'), (Get-SafeProperty $ced 'MessageIndex'))) + $retryCount = Local:Get-NumSafe (Select-FirstNonNull -Values @((Get-SafeProperty $ced 'RetryCount'), (Get-SafeProperty $ced 'Retries'))) + $clientVersion = Select-FirstNonNull -Values @((Get-SafeProperty $ced 'ClientVersion'), (Get-SafeProperty $ced 'Version'), (Get-SafeProperty $ced 'Build')) + $clientPlatform = Select-FirstNonNull -Values @((Get-SafeProperty $ced 'ClientPlatform'), (Get-SafeProperty $ced 'Platform'), (Get-SafeProperty $ced 'OS')) + + $baseSet = New-Object System.Collections.Generic.HashSet[string]; foreach ($c in $PurviewExplodedHeader) { $null = $baseSet.Add($c) } + $rows = New-Object System.Collections.Generic.List[object] + for ($i = 0; $i -lt $rowCount; $i++) { + $rowObj = [PSCustomObject][ordered]@{ + RecordId = $(if ($Record.RecordId) { $Record.RecordId } elseif ($Record.Identity) { $Record.Identity } elseif ($Record.Id) { $Record.Id } else { $auditData.Id }) + CreationDate = $creationDate + RecordType = $Record.RecordType + Operation = $auditData.Operation + UserId = $auditData.UserId + AssociatedAdminUnits = $(try { if ($Record.AssociatedAdminUnits) { $Record.AssociatedAdminUnits } elseif ($auditData.AssociatedAdminUnits) { $auditData.AssociatedAdminUnits } else { '' } } catch { '' }) + AssociatedAdminUnitsNames = $(try { if ($Record.AssociatedAdminUnitsNames) { $Record.AssociatedAdminUnitsNames } elseif ($auditData.AssociatedAdminUnitsNames) { $auditData.AssociatedAdminUnitsNames } else { '' } } catch { '' }) + '@odata.type' = (Get-SafeProperty $auditData '@odata.type') + CreationTime = $creationTime + Id = $auditDataId + OrganizationId = $organizationId + ResultStatus = $resultStatusAudit + UserKey = $auditUserKey + UserType = $userType + Version = $version + Workload = $workload + ClientIP = $clientIP + ObjectId = (Get-SafeProperty $auditData 'ObjectId') + AzureActiveDirectoryEventType = 
(Get-SafeProperty $auditData 'AzureActiveDirectoryEventType') + ActorContextId = (Get-SafeProperty $auditData 'ActorContextId') + ActorIpAddress = (Get-SafeProperty $auditData 'ActorIpAddress') + InterSystemsId = (Get-SafeProperty $auditData 'InterSystemsId') + IntraSystemId = (Get-SafeProperty $auditData 'IntraSystemId') + SupportTicketId = (Get-SafeProperty $auditData 'SupportTicketId') + TargetContextId = (Get-SafeProperty $auditData 'TargetContextId') + ApplicationId = $applicationId + 'DeviceProperties.OS' = $dpOS + 'DeviceProperties.BrowserType' = $dpBrowser + ErrorNumber = (Get-SafeProperty $auditData 'ErrorNumber') + SiteUrl = (Get-SafeProperty $auditData 'SiteUrl') + SourceRelativeUrl = (Get-SafeProperty $auditData 'SourceRelativeUrl') + SourceFileName = (Get-SafeProperty $auditData 'SourceFileName') + SourceFileExtension = (Get-SafeProperty $auditData 'SourceFileExtension') + ListId = (Get-SafeProperty $auditData 'ListId') + ListItemUniqueId = (Get-SafeProperty $auditData 'ListItemUniqueId') + WebId = (Get-SafeProperty $auditData 'WebId') + ApplicationDisplayName = (Get-SafeProperty $auditData 'ApplicationDisplayName') + EventSource = (Get-SafeProperty $auditData 'EventSource') + ItemType = (Get-SafeProperty $auditData 'ItemType') + SiteSensitivityLabelId = (Get-SafeProperty $auditData 'SiteSensitivityLabelId') + GeoLocation = (Get-SafeProperty $auditData 'GeoLocation') + IsManagedDevice = (Get-SafeProperty $auditData 'IsManagedDevice') + DeviceDisplayName = (Get-SafeProperty $auditData 'DeviceDisplayName') + ListBaseType = (Get-SafeProperty $auditData 'ListBaseType') + ListServerTemplate = (Get-SafeProperty $auditData 'ListServerTemplate') + AuthenticationType = (Get-SafeProperty $auditData 'AuthenticationType') + Site = (Get-SafeProperty $auditData 'Site') + DoNotDistributeEvent = (Get-SafeProperty $auditData 'DoNotDistributeEvent') + HighPriorityMediaProcessing = (Get-SafeProperty $auditData 'HighPriorityMediaProcessing') + BrowserName = 
(Get-SafeProperty $auditData 'BrowserName') + BrowserVersion = (Get-SafeProperty $auditData 'BrowserVersion') + CorrelationId = $correlationId + Platform = (Get-SafeProperty $auditData 'Platform') + UserAgent = (Get-SafeProperty $auditData 'UserAgent') + ActorInfoString = (Get-SafeProperty $auditData 'ActorInfoString') + AppId = $appId + AuthType = (Get-SafeProperty $auditData 'AuthType') + ClientAppId = $clientAppId + ClientIPAddress = (Get-SafeProperty $auditData 'ClientIPAddress') + ClientInfoString = (Get-SafeProperty $auditData 'ClientInfoString') + ExternalAccess = (Get-SafeProperty $auditData 'ExternalAccess') + InternalLogonType = (Get-SafeProperty $auditData 'InternalLogonType') + LogonType = (Get-SafeProperty $auditData 'LogonType') + LogonUserSid = (Get-SafeProperty $auditData 'LogonUserSid') + MailboxGuid = (Get-SafeProperty $auditData 'MailboxGuid') + MailboxOwnerSid = (Get-SafeProperty $auditData 'MailboxOwnerSid') + MailboxOwnerUPN = (Get-SafeProperty $auditData 'MailboxOwnerUPN') + OrganizationName = (Get-SafeProperty $auditData 'OrganizationName') + OriginatingServer = (Get-SafeProperty $auditData 'OriginatingServer') + SessionId = (Get-SafeProperty $auditData 'SessionId') + TokenObjectId = (Get-SafeProperty $auditData 'TokenObjectId') + TokenTenantId = (Get-SafeProperty $auditData 'TokenTenantId') + TokenType = (Get-SafeProperty $auditData 'TokenType') + SaveToSentItems = (Get-SafeProperty $auditData 'SaveToSentItems') + OperationCount = (Get-SafeProperty $auditData 'OperationCount') + FileSizeBytes = (Get-SafeProperty $auditData 'FileSizeBytes') + MeetingId = (Get-SafeProperty $auditData 'MeetingId') + MeetingType = (Get-SafeProperty $auditData 'MeetingType') + EventSignature = (Get-SafeProperty $auditData 'EventSignature') + EventData = (Get-SafeProperty $auditData 'EventData') + Permission = (Get-SafeProperty $auditData 'Permission') + SensitivityLabelId = (Get-SafeProperty $auditData 'SensitivityLabelId') + SharingLinkScope = (Get-SafeProperty 
$auditData 'SharingLinkScope') + TargetUserOrGroupType = (Get-SafeProperty $auditData 'TargetUserOrGroupType') + TargetUserOrGroupName = (Get-SafeProperty $auditData 'TargetUserOrGroupName') + MeetingURL = (Get-SafeProperty $auditData 'MeetingURL') + ChatId = (Get-SafeProperty $auditData 'ChatId') + MessageId = (Get-SafeProperty $auditData 'MessageId') + MessageSizeInBytes = (Get-SafeProperty $auditData 'MessageSizeInBytes') + MessageType = (Get-SafeProperty $auditData 'MessageType') + FormId = (Get-SafeProperty $auditData 'FormId') + FormName = (Get-SafeProperty $auditData 'FormName') + VideoId = (Get-SafeProperty $auditData 'VideoId') + VideoName = (Get-SafeProperty $auditData 'VideoName') + ChannelId = (Get-SafeProperty $auditData 'ChannelId') + ViewDuration = (Get-SafeProperty $auditData 'ViewDuration') + ClientRegion = $clientRegion + CopilotLogVersion = $copilotLogVersion + TargetId = (Get-SafeProperty $auditData 'TargetId') + TeamName = (Get-SafeProperty $auditData 'TeamName') + TeamGuid = (Get-SafeProperty $auditData 'TeamGuid') + ResponseId = (Get-SafeProperty $auditData 'ResponseId') + IsAnonymous = (Get-SafeProperty $auditData 'IsAnonymous') + DeviceType = (Get-SafeProperty $auditData 'DeviceType') + ChannelName = (Get-SafeProperty $auditData 'ChannelName') + ChannelGuid = (Get-SafeProperty $auditData 'ChannelGuid') + ChannelType = (Get-SafeProperty $auditData 'ChannelType') + AppName = (Get-SafeProperty $auditData 'AppName') + EnvironmentName = (Get-SafeProperty $auditData 'EnvironmentName') + PlanId = (Get-SafeProperty $auditData 'PlanId') + PlanName = (Get-SafeProperty $auditData 'PlanName') + TaskId = (Get-SafeProperty $auditData 'TaskId') + TaskName = (Get-SafeProperty $auditData 'TaskName') + PercentComplete = (Get-SafeProperty $auditData 'PercentComplete') + CrossMailboxOperation = (Get-SafeProperty $auditData 'CrossMailboxOperation') + RecordTypeNum = $(try { [int]$Record.RecordType } catch { $Record.RecordType }) + ResultStatus_Audit = 
$resultStatusAudit + ModelId = $modelId + ModelProvider = $modelProvider + ModelFamily = $modelFamily + TokensTotal = $tokensTotal + TokensInput = $tokensInput + TokensOutput = $tokensOutput + DurationMs = $durationMs + OutcomeStatus = $outcomeStatus + ConversationId = $conversationId + TurnNumber = $turnNumber + RetryCount = $retryCount + ClientVersion = $clientVersion + ClientPlatform = $clientPlatform + AgentId = $agentId + AgentName = $agentName + AgentVersion = $agentVersion + AgentCategory = $agentCategory + ApplicationName = (Get-SafeProperty $auditData 'ApplicationName') + SensitivityLabel = $(if ($i -lt $sensitivityLabels.Count) { try { [string]$sensitivityLabels[$i] } catch { '' } } else { '' }) + AppHost = $appHost + ThreadId = $threadId + Context_Id = $(if ($i -lt $contexts.Count -and $contexts[$i]) { try { Get-SafeProperty $contexts[$i] 'Id' } catch { '' } } else { '' }) + Context_Type = $(if ($i -lt $contexts.Count -and $contexts[$i]) { try { Get-SafeProperty $contexts[$i] 'Type' } catch { '' } } else { '' }) + Message_Id = $(if ($i -lt $messages.Count) { $msg = $messages[$i]; if ($msg -is [psobject]) { try { Get-SafeProperty $msg 'Id' } catch { '' } } else { $msg } } else { '' }) + Message_isPrompt = $(if ($i -lt $messages.Count) { $msg = $messages[$i]; if ($msg -is [psobject]) { try { script:BoolTFFast (Get-SafeProperty $msg 'isPrompt') } catch { '' } } else { '' } } else { '' }) + AccessedResource_Action = $(if ($i -lt $resources.Count -and $resources[$i]) { try { Get-SafeProperty $resources[$i] 'Action' } catch { '' } } else { '' }) + AccessedResource_PolicyDetails = $(if ($i -lt $resources.Count -and $resources[$i]) { try { script:ToJsonIfObjectFast (Get-SafeProperty $resources[$i] 'PolicyDetails') } catch { '' } } else { '' }) + AccessedResource_SiteUrl = $(if ($i -lt $resources.Count -and $resources[$i]) { try { Get-SafeProperty $resources[$i] 'SiteUrl' } catch { '' } } else { '' }) + AISystemPlugin_Id = $(if ($i -lt $pluginsRaw.Count -and 
$pluginsRaw[$i]) { try { Get-SafeProperty $pluginsRaw[$i] 'Id' } catch { '' } } else { '' }) + AISystemPlugin_Name = $(if ($i -lt $pluginsRaw.Count -and $pluginsRaw[$i]) { try { Get-SafeProperty $pluginsRaw[$i] 'Name' } catch { '' } } else { '' }) + ModelTransparencyDetails_ModelName = $(if ($i -lt $modelDetRaw.Count -and $modelDetRaw[$i]) { try { Get-SafeProperty $modelDetRaw[$i] 'ModelName' } catch { '' } } else { '' }) + MessageIds = $(if ($messageIds.Count -gt 0) { $messageIds -join ';' } else { '' }) + AccessedResource_Name = $(if ($i -lt $resources.Count -and $resources[$i]) { try { Get-SafeProperty $resources[$i] 'Name' } catch { '' } } else { '' }) + AccessedResource_SensitivityLabel = $(if ($i -lt $resources.Count -and $resources[$i]) { try { Get-SafeProperty $resources[$i] 'SensitivityLabel' } catch { '' } } else { '' }) + AccessedResource_ResourceType = $(if ($i -lt $resources.Count -and $resources[$i]) { try { Get-SafeProperty $resources[$i] 'ResourceType' } catch { '' } } else { '' }) + Context_Item = $( + if ($activityType -eq 'CopilotInteraction') { + if ($PartialExplode) { + if ($i -lt $contexts.Count -and $contexts[$i]) { + try { + $items = script:GetArrayFast $contexts[$i] 'Items' + if ($items -and $items.Count -gt 0) { + ($items | ForEach-Object { try { script:ToJsonIfObjectFast $_ } catch { '' } }) -join ';' + } else { '' } + } catch { '' } + } else { '' } + } else { + try { + $foundItem = $null + foreach ($ctx in $contexts) { + if ($ctx) { + $items = script:GetArrayFast $ctx 'Items' + if ($items -and $i -lt $items.Count) { + $foundItem = $items[$i] + break + } + } + } + if ($foundItem) { script:ToJsonIfObjectFast $foundItem } else { '' } + } catch { '' } + } + } else { '' } + ) + } + + # Partial explosion mode: Preserve AuditData column (full JSON) for downstream processing + if ($PartialExplode) { + try { + Add-Member -InputObject $rowObj -NotePropertyName 'AuditData' -NotePropertyValue $Record.AuditData -Force + } catch {} + } + + # DSPM for 
AI: 2-level explosion for ConnectedAIAppInteraction (AppIdentity.Plugins[]) + if ($activityType -eq 'ConnectedAIAppInteraction' -and $plugins) { + try { + if ($PartialExplode) { + # Partial mode: Semi-colon-joined JSON for all plugins + $pluginsList = ($plugins | ForEach-Object { try { script:ToJsonIfObjectFast $_ } catch { '' } }) -join ';' + if (-not $rowObj.PSObject.Properties['AppIdentity_Plugins']) { + Add-Member -InputObject $rowObj -NotePropertyName 'AppIdentity_Plugins' -NotePropertyValue $pluginsList -Force + if (-not $script:DeepExtraColumns.Contains('AppIdentity_Plugins')) { [void]$script:DeepExtraColumns.Add('AppIdentity_Plugins') } + } + } else { + # Full mode: One plugin per row + if ($i -lt $plugins.Count) { + $plugin = $plugins[$i] + $pluginJson = try { script:ToJsonIfObjectFast $plugin } catch { '' } + if (-not $rowObj.PSObject.Properties['AppIdentity_Plugin']) { + Add-Member -InputObject $rowObj -NotePropertyName 'AppIdentity_Plugin' -NotePropertyValue $pluginJson -Force + if (-not $script:DeepExtraColumns.Contains('AppIdentity_Plugin')) { [void]$script:DeepExtraColumns.Add('AppIdentity_Plugin') } + } + } + } + } catch {} + } + + if ($Deep) { + if ($ced) { + $flat = ConvertTo-FlatColumns -Node $ced -Prefix '' -MaxDepth $FlatDepthDeep + foreach ($k in $flat.Keys) { if ($baseSet.Contains($k)) { continue }; if (-not $rowObj.PSObject.Properties[$k]) { if (-not $script:DeepExtraColumns.Contains($k)) { [void]$script:DeepExtraColumns.Add($k) }; try { Add-Member -InputObject $rowObj -NotePropertyName $k -NotePropertyValue $flat[$k] -Force } catch {} } } + } + if ($auditData) { + $auditDataClone = [PSCustomObject]@{} + foreach ($prop in $auditData.PSObject.Properties) { if ($prop.Name -ne 'CopilotEventData') { Add-Member -InputObject $auditDataClone -NotePropertyName $prop.Name -NotePropertyValue $prop.Value -Force } } + $flatAudit = ConvertTo-FlatColumns -Node $auditDataClone -Prefix '' -MaxDepth $FlatDepthDeep + foreach ($k in $flatAudit.Keys) { if 
($baseSet.Contains($k)) { continue }; if (-not $rowObj.PSObject.Properties[$k]) { if (-not $script:DeepExtraColumns.Contains($k)) { [void]$script:DeepExtraColumns.Add($k) }; try { Add-Member -InputObject $rowObj -NotePropertyName $k -NotePropertyValue $flatAudit[$k] -Force } catch {} } }
+                }
+            }
+            $rows.Add($rowObj) | Out-Null
+        }
+        if (-not $SkipMetrics -and $rows.Count -gt 1) { try { $script:metrics.ExplosionEvents += 1; $script:metrics.ExplosionRowsFromEvents += ($rows.Count - 1); if ($rows.Count -gt $script:metrics.ExplosionMaxPerRecord) { $script:metrics.ExplosionMaxPerRecord = $rows.Count } } catch {} }
+        return $rows
+    }
+    catch {
+        if (-not $SkipMetrics) {
+            $script:metrics.FilteringSkippedRecords++
+            $script:metrics.FilteringParseFailures++
+        }
+        Write-LogHost "Failed Purview explosion: $($_.Exception.Message)" -ForegroundColor Red
+        return @()
+    }
+}
+
+# Returns the first entry of -Values that is neither $null nor an empty string
+# (after [string] conversion). Returns $null when no entry qualifies.
+function Select-FirstNonNull {
+    param([object[]]$Values)
+    foreach ($candidate in $Values) {
+        if ($null -ne $candidate -and ('' -ne [string]$candidate)) { return $candidate }
+    }
+    return $null
+}
+
+function Convert-ToStructuredRecord {
+    # Converts one raw audit record into one or more flat output rows.
+    # Uses proven stable implementation for record conversion
+    #
+    # Parameters:
+    #   -Record           Raw audit record. AuditData is taken from the pre-parsed
+    #                     _ParsedAuditData property when present, otherwise parsed
+    #                     from the AuditData JSON string.
+    #   -EnableExplosion  When $true, the Suggestions/Actions/References/Participants
+    #                     arrays of CopilotEventData are cross-joined into multiple rows.
+    #
+    # Also reads from the enclosing scope: $ExplodeDeep, $FlatDepthStandard,
+    # $ExplosionPerRecordRowCap, $JsonDepth and $script:metrics.
+    #
+    # Returns:
+    #   - @() when AuditData is missing/unparseable or any exception occurs
+    #     (the skip/failure counters in $script:metrics are incremented).
+    #   - A single 8-column compact record when neither explosion nor deep mode is on.
+    #   - A single flattened base record when only $ExplodeDeep is on.
+    #   - One row per array-element combination (capped by $ExplosionPerRecordRowCap)
+    #     when -EnableExplosion is on.
+    param(
+        [Parameter(Mandatory = $true)] $Record,
+        [bool]$EnableExplosion = $false
+    )
+    try {
+        # Numeric coercion helper: $null / blank string / unparseable values yield $null.
+        function Local:Get-Num([object]$v) { if ($null -eq $v) { return $null }; try { if ($v -is [string] -and [string]::IsNullOrWhiteSpace($v)) { return $null }; return [double]$v } catch { return $null } }
+        # Sets a property on $obj, adding it when it does not exist yet (best effort).
+        function Local:Add-OrUpdate([pscustomobject]$obj, [string]$name, $value) { try { if ($obj.PSObject.Properties[$name]) { $obj.PSObject.Properties[$name].Value = $value } else { Add-Member -InputObject $obj -NotePropertyName $name -NotePropertyValue $value -Force } } catch {} }
+
+        # Use pre-parsed AuditData if available
+        $auditData = if ($Record.PSObject.Properties['_ParsedAuditData']) { $Record._ParsedAuditData } else { try { $Record.AuditData | ConvertFrom-Json -ErrorAction Stop } catch { $null } }
+        if (-not $auditData) {
+            $script:metrics.FilteringSkippedRecords++
+            $script:metrics.FilteringMissingAuditData++
+            return @()
+        }
+
+        # Record identifier: first populated of RecordId / Identity / Id, else AuditData.Id.
+        $recordId = $(if ($Record.RecordId) { $Record.RecordId } elseif ($Record.Identity) { $Record.Identity } elseif ($Record.Id) { $Record.Id } else { $auditData.Id })
+
+        # NON-EXPLOSION MODE: Return 8-column compact record matching Purview UI export schema
+        if (-not $EnableExplosion -and -not $ExplodeDeep) {
+            $compactRecord = [pscustomobject]@{
+                RecordId                  = $recordId
+                CreationDate              = $Record.CreationDate.ToUniversalTime().ToString('yyyy-MM-ddTHH:mm:ss.fffZ')
+                RecordType                = $Record.RecordType
+                Operation                 = $(try { $auditData.Operation } catch { if ($Record.Operation) { $Record.Operation } else { $Record.Operations } })
+                UserId                    = if ($Record.UserId) { $Record.UserId } elseif ($Record.UserIds) { $Record.UserIds } else { '' }
+                AuditData                 = $Record.AuditData
+                AssociatedAdminUnits      = $(try { if ($auditData.AssociatedAdminUnits) { $auditData.AssociatedAdminUnits } elseif ($Record.AssociatedAdminUnits) { $Record.AssociatedAdminUnits } else { '' } } catch { '' })
+                AssociatedAdminUnitsNames = $(try { if ($auditData.AssociatedAdminUnitsNames) { $auditData.AssociatedAdminUnitsNames } elseif ($Record.AssociatedAdminUnitsNames) { $Record.AssociatedAdminUnitsNames } else { '' } } catch { '' })
+            }
+            return @($compactRecord)
+        }
+
+        # EXPLOSION MODE: Extract and flatten all fields from AuditData (no raw JSON columns)
+        $ced = Get-SafeProperty $auditData 'CopilotEventData'
+        $modelId = Select-FirstNonNull -Values @((Get-SafeProperty $ced 'ModelId'), (Get-SafeProperty $ced 'ModelID'), (Get-SafeProperty $auditData 'ModelId'))
+        $modelProvider = Select-FirstNonNull -Values @((Get-SafeProperty $ced 'ModelProvider'), (Get-SafeProperty $ced 'Provider'), (Get-SafeProperty $ced 'ModelVendor'))
+        $modelFamily = Select-FirstNonNull -Values @((Get-SafeProperty $ced 'ModelFamily'), (Get-SafeProperty $ced 'ModelType'))
+        $usageNode = Select-FirstNonNull -Values @((Get-SafeProperty $ced 'Usage'), (Get-SafeProperty $ced 'TokenUsage'), (Get-SafeProperty $ced 'Tokens'), (Get-SafeProperty $auditData 'Usage'))
+        $tokensTotal = $null; $tokensInput = $null; $tokensOutput = $null
+        if ($usageNode) {
+            $tokensTotal = Local:Get-Num (Select-FirstNonNull -Values @((Get-SafeProperty $usageNode 'Total'), (Get-SafeProperty $usageNode 'TotalTokens'), (Get-SafeProperty $usageNode 'TokensTotal')))
+            $tokensInput = Local:Get-Num (Select-FirstNonNull -Values @((Get-SafeProperty $usageNode 'Input'), (Get-SafeProperty $usageNode 'Prompt'), (Get-SafeProperty $usageNode 'InputTokens'), (Get-SafeProperty $usageNode 'TokensInput')))
+            $tokensOutput = Local:Get-Num (Select-FirstNonNull -Values @((Get-SafeProperty $usageNode 'Output'), (Get-SafeProperty $usageNode 'Completion'), (Get-SafeProperty $usageNode 'OutputTokens'), (Get-SafeProperty $usageNode 'TokensOutput')))
+        }
+        # Derive the total from its parts when only input/output counts were reported.
+        if (-not $tokensTotal -and ($tokensInput -or $tokensOutput)) { try { $tokensTotal = ($tokensInput + $tokensOutput) } catch {} }
+        $durationMs = Local:Get-Num (Select-FirstNonNull -Values @((Get-SafeProperty $ced 'DurationMs'), (Get-SafeProperty $ced 'ElapsedMs'), (Get-SafeProperty $ced 'ProcessingTimeMs'), (Get-SafeProperty $ced 'LatencyMs')))
+        $outcomeStatus = Select-FirstNonNull -Values @((Get-SafeProperty $ced 'OutcomeStatus'), (Get-SafeProperty $ced 'Outcome'), (Get-SafeProperty $ced 'Result'), (Get-SafeProperty $ced 'Status'))
+        if ($outcomeStatus -is [bool]) { $outcomeStatus = if ($outcomeStatus) { 'Success' } else { 'Failure' } }
+        $conversationId = Select-FirstNonNull -Values @((Get-SafeProperty $ced 'ConversationId'), (Get-SafeProperty $ced 'ConversationID'), (Get-SafeProperty $ced 'SessionId'))
+        $turnNumber = Local:Get-Num (Select-FirstNonNull -Values @((Get-SafeProperty $ced 'TurnNumber'), (Get-SafeProperty $ced 'TurnIndex'), (Get-SafeProperty $ced 'MessageIndex')))
+        $retryCount = Local:Get-Num (Select-FirstNonNull -Values @((Get-SafeProperty $ced 'RetryCount'), (Get-SafeProperty $ced 'Retries')))
+        $clientVersion = Select-FirstNonNull -Values @((Get-SafeProperty $ced 'ClientVersion'), (Get-SafeProperty $ced 'Version'), (Get-SafeProperty $ced 'Build'))
+        $clientPlatform = Select-FirstNonNull -Values @((Get-SafeProperty $ced 'ClientPlatform'), (Get-SafeProperty $ced 'Platform'), (Get-SafeProperty $ced 'OS'))
+        $agentId = Select-FirstNonNull -Values @((Get-SafeProperty $ced 'AgentId'), (Get-SafeProperty $ced 'AgentID'), (Get-SafeProperty $ced 'AssistantId'))
+        $agentName = Select-FirstNonNull -Values @((Get-SafeProperty $ced 'AgentName'), (Get-SafeProperty $ced 'AssistantName'))
+        $agentVersion = Select-FirstNonNull -Values @((Get-SafeProperty $ced 'AgentVersion'), (Get-SafeProperty $ced 'Version'))
+
+        # Agent categorization based on AgentId pattern
+        $agentCategory = ""
+        if ($agentId) {
+            if ($agentId -like "CopilotStudio.Declarative.*") {
+                $agentCategory = "Declarative Agent"
+            } elseif ($agentId -like "CopilotStudio.CustomEngine.*") {
+                $agentCategory = "Custom Engine Agent"
+            } elseif ($agentId -like "P_*") {
+                $agentCategory = "Declarative Agent (Purview)"
+            } else {
+                $agentCategory = "Other Agent"
+            }
+        }
+
+        # With -IncludeUserInfo, license data now appears only in EntraUsers output
+        # (or in combined mode via left join)
+
+        $appIdentity = Select-FirstNonNull -Values @((Get-SafeProperty $ced 'AppIdentity'), (Get-SafeProperty $ced 'ApplicationId'), (Get-SafeProperty $ced 'HostAppId'))
+        $applicationName = Select-FirstNonNull -Values @((Get-SafeProperty $ced 'ApplicationName'), (Get-SafeProperty $ced 'HostAppName'), (Get-SafeProperty $ced 'ClientAppName'))
+        $suggestions = (Get-SafeProperty $ced 'Suggestions'); if (-not $suggestions) { $suggestions = Get-SafeProperty $ced 'SuggestionList' }
+        $actions = Get-SafeProperty $ced 'Actions'
+        $references = Select-FirstNonNull -Values @((Get-SafeProperty $ced 'References'), (Get-SafeProperty $ced 'Sources'), (Get-SafeProperty $ced 'Citations'))
+        $participants = Get-SafeProperty $ced 'Participants'
+
+        # Builds aggregate columns (<prefix>Count, Types, AvgLatencyMs, AvgEdits, TotalEdits,
+        # Accepted, AcceptanceRate, Success, Failure) for one collection. Returns an empty
+        # hashtable when the collection is missing or empty.
+        function Local:Measure-Collection($items, [string]$prefix) {
+            $result = @{}
+            if (-not $items) { return $result }
+            $arr = @($items)
+            if ($arr.Count -eq 0) { return $result }
+            $result["${prefix}Count"] = $arr.Count
+            $types = New-Object System.Collections.Generic.HashSet[string]
+            $latencies = @(); $edits = @(); $accepted = 0; $success = 0; $failure = 0
+            foreach ($s in $arr) {
+                foreach ($cand in @('Type', 'SuggestionType', 'Name', 'Kind', 'ActionType')) { try { if ($s.PSObject.Properties[$cand]) { [void]$types.Add([string]$s.$cand); break } } catch {} }
+                # Get-Num is called WITHOUT the Local: qualifier here: Local: restricts command
+                # lookup to this nested function's own scope, where Get-Num is not defined, and the
+                # resulting error was silently swallowed so latency/edit metrics were never produced.
+                foreach ($lat in @('LatencyMs', 'DurationMs', 'ElapsedMs')) { try { if ($s.PSObject.Properties[$lat]) { $v = Get-Num $s.$lat; if ($null -ne $v) { $latencies += $v; break } } } catch {} }
+                foreach ($ed in @('EditCount', 'Edits', 'EditsCount')) { try { if ($s.PSObject.Properties[$ed]) { $v = Get-Num $s.$ed; if ($null -ne $v) { $edits += $v; break } } } catch {} }
+                # Only the first alias present on the item is evaluated, so an item exposing
+                # several aliases (e.g. Accepted and Success) is counted at most once.
+                foreach ($acc in @('Accepted', 'IsAccepted', 'Success', 'Succeeded')) {
+                    try {
+                        if ($s.PSObject.Properties[$acc]) {
+                            $val = $s.$acc
+                            if ($val -is [bool]) { if ($val) { $accepted++ } } elseif ($val -match '^(?i:true|yes|1|success)') { $accepted++ }
+                            break
+                        }
+                    } catch {}
+                }
+                foreach ($succ in @('Success', 'Succeeded')) {
+                    try {
+                        if ($s.PSObject.Properties[$succ]) {
+                            $val = $s.$succ
+                            if ($val -is [bool]) { if ($val) { $success++ } else { $failure++ } } elseif ($val -match '^(?i:true|yes|1|success)') { $success++ } else { $failure++ }
+                            break
+                        }
+                    } catch {}
+                }
+            }
+            if ($types.Count -gt 0) { $result["${prefix}Types"] = [string]::Join(';', [array]$types) }
+            if ($latencies.Count -gt 0) { $result["${prefix}AvgLatencyMs"] = [math]::Round(($latencies | Measure-Object -Average).Average, 2) }
+            if ($edits.Count -gt 0) { $result["${prefix}AvgEdits"] = [math]::Round(($edits | Measure-Object -Average).Average, 2); $result["${prefix}TotalEdits"] = ($edits | Measure-Object -Sum).Sum }
+            if ($accepted -gt 0) { $result["${prefix}Accepted"] = $accepted; $result["${prefix}AcceptanceRate"] = [math]::Round(($accepted / $arr.Count) * 100, 2) }
+            if ($success -gt 0 -or $failure -gt 0) { $result["${prefix}Success"] = $success; $result["${prefix}Failure"] = $failure }
+            return $result
+        }
+        $suggestAgg = Local:Measure-Collection $suggestions 'Suggestions'
+        $actionAgg = Local:Measure-Collection $actions 'Actions'
+        $refAgg = Local:Measure-Collection $references 'References'
+        $partAgg = Local:Measure-Collection $participants 'Participants'
+        $baseRecord = [pscustomobject]@{
+            RecordId           = $recordId
+            RecordType         = $Record.RecordType
+            CreationDate       = $Record.CreationDate.ToUniversalTime().ToString('yyyy-MM-ddTHH:mm:ss.fffZ')
+            ResultStatus       = $Record.ResultStatus
+            ResultCount        = $Record.ResultCount
+            Identity           = $Record.Identity
+            IsValid            = $Record.IsValid
+            ObjectState        = $Record.ObjectState
+            Id                 = $auditData.Id
+            # Normalised to UTC ISO-8601 when parseable; otherwise the raw value is kept.
+            CreationTime       = & { $ct = script:Parse-DateSafe $auditData.CreationTime; if ($ct) { $ct.ToUniversalTime().ToString('yyyy-MM-ddTHH:mm:ss.fffZ') } else { $auditData.CreationTime } }
+            Operation          = $auditData.Operation
+            OrganizationId     = $auditData.OrganizationId
+            RecordTypeNum      = $auditData.RecordType
+            ResultStatus_Audit = $auditData.ResultStatus
+            UserKey            = $auditData.UserKey
+            UserType           = $auditData.UserType
+            Version            = $auditData.Version
+            Workload           = $auditData.Workload
+            UserId             = $auditData.UserId
+            AppId              = $auditData.AppId
+            ClientAppId        = $auditData.ClientAppId
+            CorrelationId      = $auditData.CorrelationId
+            ModelId            = $modelId
+            ModelProvider      = $modelProvider
+            ModelFamily        = $modelFamily
+            TokensTotal        = $tokensTotal
+            TokensInput        = $tokensInput
+            TokensOutput       = $tokensOutput
+            DurationMs         = $durationMs
+            OutcomeStatus      = $outcomeStatus
+            ConversationId     = $conversationId
+            TurnNumber         = $turnNumber
+            RetryCount         = $retryCount
+            ClientVersion      = $clientVersion
+            ClientPlatform     = $clientPlatform
+            AgentId            = $agentId
+            AgentName          = $agentName
+            AgentVersion       = $agentVersion
+            AgentCategory      = $agentCategory
+            AppIdentity        = $appIdentity
+            ApplicationName    = $applicationName
+        }
+        # Flatten AppAccessContext for Copilot/AI records as well
+        try {
+            $aac = Get-SafeProperty $auditData 'AppAccessContext'
+            if ($aac -and -not (Test-ScalarValue $aac)) {
+                $flatAac = ConvertTo-FlatColumns -Node $aac -Prefix 'AppAccessContext.' -MaxDepth $FlatDepthStandard
+                foreach ($k in $flatAac.Keys) { if (-not $baseRecord.PSObject.Properties[$k]) { Add-Member -InputObject $baseRecord -NotePropertyName $k -NotePropertyValue $flatAac[$k] -Force } }
+                if ($baseRecord.PSObject.Properties['AppAccessContext']) { $baseRecord.PSObject.Members.Remove('AppAccessContext') }
+            }
+            elseif ($aac -and (Test-ScalarValue $aac)) {
+                if (-not $baseRecord.PSObject.Properties['AppAccessContext']) { Add-Member -InputObject $baseRecord -NotePropertyName 'AppAccessContext' -NotePropertyValue $aac -Force }
+            }
+        } catch {}
+        foreach ($k in $suggestAgg.Keys) { Add-OrUpdate $baseRecord $k $suggestAgg[$k] }
+        foreach ($k in $actionAgg.Keys) { Add-OrUpdate $baseRecord $k $actionAgg[$k] }
+        foreach ($k in $refAgg.Keys) { Add-OrUpdate $baseRecord $k $refAgg[$k] }
+        foreach ($k in $partAgg.Keys) { Add-OrUpdate $baseRecord $k $partAgg[$k] }
+
+        # If not doing array explosion, return base record now
+        if (-not $EnableExplosion) { return @($baseRecord) }
+        $rows = @($baseRecord)
+        $arraysToExplode = @(
+            @{ Name = 'Suggestions'; Data = $suggestions; Prefix = 'Suggestion'; Enabled = $suggestions },
+            @{ Name = 'Actions'; Data = $actions; Prefix = 'Action'; Enabled = $actions },
+            @{ Name = 'References'; Data = $references; Prefix = 'Reference'; Enabled = $references },
+            @{ Name = 'Participants'; Data = $participants; Prefix = 'Participant'; Enabled = $participants }
+        )
+        $maxRows = $ExplosionPerRecordRowCap
+        # Each enabled array multiplies the current row set (cross join). Generation stops
+        # as soon as the cap is exceeded by one row, which is what triggers truncation below.
+        foreach ($entry in $arraysToExplode) {
+            if (-not $entry.Enabled) { continue }
+            $dataArr = @($entry.Data); if ($dataArr.Count -eq 0) { continue }
+            $newRows = New-Object System.Collections.ArrayList
+            foreach ($r in $rows) {
+                $idx = 0
+                foreach ($el in $dataArr) {
+                    # Copy the parent row, then add the element's properties as <Prefix>_<Name>.
+                    $nr = [pscustomobject]@{}
+                    foreach ($p in $r.PSObject.Properties) { Add-Member -InputObject $nr -NotePropertyName $p.Name -NotePropertyValue $p.Value -Force }
+                    Add-OrUpdate $nr ("ArrayIndex_{0}" -f $entry.Name) $idx
+                    if ($el) {
+                        foreach ($prop in $el.PSObject.Properties) {
+                            $pname = ("{0}_{1}" -f $entry.Prefix, $prop.Name)
+                            if ($nr.PSObject.Properties[$pname]) { continue }
+                            $val = $prop.Value
+                            if (Test-ScalarValue $val) { Add-OrUpdate $nr $pname $val } else { try { Add-OrUpdate $nr $pname ($val | ConvertTo-Json -Depth $JsonDepth -Compress) } catch {} }
+                        }
+                    }
+                    [void]$newRows.Add($nr); $idx++
+                    if ($newRows.Count -gt $maxRows) { break }
+                }
+                if ($newRows.Count -gt $maxRows) { break }
+            }
+            $rows = @($newRows)
+            if ($rows.Count -gt $maxRows) { break }
+        }
+        if ($rows.Count -gt $maxRows) {
+            foreach ($r in $rows) { Add-OrUpdate $r 'ExplosionTruncated' $true }
+            # @() keeps the result an array even when the cap is 1.
+            $rows = @($rows[0..($maxRows - 1)])
+            try { $script:metrics.ExplosionTruncated = $true } catch {}
+        }
+        if ($ExplodeDeep -and $ced) {
+            # The flattened CopilotEventData does not depend on the row, so compute it once.
+            $flat = ConvertTo-FlatColumns -Node $ced -Prefix '' -MaxDepth $FlatDepthStandard
+            foreach ($r in $rows) {
+                foreach ($ck in $flat.Keys) { if (-not $r.PSObject.Properties[$ck]) { Add-OrUpdate $r $ck $flat[$ck] } }
+            }
+        }
+        return $rows
+    }
+    catch {
+        $script:metrics.FilteringSkippedRecords++
+        $script:metrics.FilteringParseFailures++
+        Write-LogHost "Failed to process record: $($_.Exception.Message)" -ForegroundColor Red
+        return @()
+    }
+}
+
+try {
+    # Unregister the early exit handler since catch/finally will handle Ctrl+C from this point
+    # This prevents duplicate "Script Interrupted" messages
+    Unregister-Event -SourceIdentifier PowerShell.Exiting -ErrorAction SilentlyContinue
+
+    # ============================================================
+    # RESUME MODE
VALIDATION - Ensure no conflicting parameters + # ============================================================ + if ($PSBoundParameters.ContainsKey('Resume')) { + # Resume mode is standalone - only auth-related parameters allowed + $allowedWithResume = @( + 'Resume', + 'Force', + 'Auth', + 'TenantId', + 'ClientId', + 'ClientSecret', + 'ClientCertificateThumbprint', + 'ClientCertificateStoreLocation', + 'ClientCertificatePath', + 'ClientCertificatePassword', + # Standard PowerShell common parameters + 'Verbose', + 'Debug', + 'ErrorAction', + 'WarningAction', + 'InformationAction', + 'ErrorVariable', + 'WarningVariable', + 'InformationVariable', + 'OutVariable', + 'OutBuffer', + 'PipelineVariable' + ) + + $invalidParams = @($PSBoundParameters.Keys | Where-Object { $_ -notin $allowedWithResume }) + + if ($invalidParams.Count -gt 0) { + Write-Host "" + Write-Host "════════════════════════════════════════════════════════════════════════════════" -ForegroundColor Red + Write-Host " ERROR: Invalid parameters used with -Resume" -ForegroundColor Red + Write-Host "════════════════════════════════════════════════════════════════════════════════" -ForegroundColor Red + Write-Host "" + Write-Host " Resume mode restores ALL settings from the checkpoint file." -ForegroundColor Yellow + Write-Host " You cannot specify other parameters (they would be ignored or cause inconsistency)." 
-ForegroundColor Yellow + Write-Host "" + Write-Host " Invalid parameters:" -ForegroundColor White + foreach ($p in $invalidParams) { + Write-Host " - $p" -ForegroundColor Red + } + Write-Host "" + Write-Host " ALLOWED with -Resume:" -ForegroundColor Green + Write-Host " -Resume [path] Checkpoint file (or auto-discover)" -ForegroundColor Gray + Write-Host " -Force Use most recent checkpoint without prompting" -ForegroundColor Gray + Write-Host " -Auth Override authentication method" -ForegroundColor Gray + Write-Host " -TenantId Tenant ID (for AppRegistration)" -ForegroundColor Gray + Write-Host " -ClientId Client ID (for AppRegistration)" -ForegroundColor Gray + Write-Host " -ClientSecret Client secret (for AppRegistration)" -ForegroundColor Gray + Write-Host "" + Write-Host " Example usage:" -ForegroundColor Cyan + Write-Host ' .\Script.ps1 -Resume' -ForegroundColor White + Write-Host ' .\Script.ps1 -Resume -Auth DeviceCode' -ForegroundColor White + Write-Host ' .\Script.ps1 -Resume "C:\path\.pax_checkpoint_xxx.json" -Force' -ForegroundColor White + Write-Host "" + exit 1 + } + } + + # ============================================================ + # RESUME MODE DETECTION - Check for checkpoint to resume + # ============================================================ + if ($ResumeSpecified) { + Write-LogHost "" + Write-LogHost "========================================" -ForegroundColor Cyan + Write-LogHost " RESUME MODE DETECTED" -ForegroundColor Cyan + Write-LogHost "========================================" -ForegroundColor Cyan + Write-LogHost "" + + if ($Resume -ne '') { + # Explicit checkpoint path provided + Write-LogHost "Loading checkpoint from explicit path: $Resume" -ForegroundColor Yellow + $checkpointLoadSuccess = Read-Checkpoint -CheckpointPath $Resume + if (-not $checkpointLoadSuccess) { + Write-LogHost "ERROR: Failed to load checkpoint file. Cannot resume." 
-ForegroundColor Red
            exit 1
        }
        $script:CheckpointPath = $Resume
        # Read-Checkpoint sets $script:CheckpointData on success
        $checkpointData = $script:CheckpointData
    }
    else {
        # Auto-discover checkpoints in OutputPath
        # When -OutputPath was not supplied the current working directory is searched instead.
        $searchPath = if ($OutputPath) { $OutputPath } else { (Get-Location).Path }
        Write-LogHost "Searching for checkpoints in: $searchPath" -ForegroundColor Yellow

        $checkpoints = Find-Checkpoints -OutputPath $searchPath

        # Three outcomes: nothing found (print guidance, exit 1), exactly one (use it),
        # or several (pick newest under -Force, otherwise ask the user).
        if ($checkpoints.Count -eq 0) {
            Write-LogHost ""
            Write-LogHost "════════════════════════════════════════════════════════════════════════════════" -ForegroundColor Red
            Write-LogHost " NO CHECKPOINT FILES FOUND" -ForegroundColor Red
            Write-LogHost "════════════════════════════════════════════════════════════════════════════════" -ForegroundColor Red
            Write-LogHost ""
            Write-LogHost " Searched in: $searchPath" -ForegroundColor White
            Write-LogHost ""
            Write-LogHost " Checkpoint files are named: .pax_checkpoint_YYYYMMDD_HHMMSS.json" -ForegroundColor Gray
            Write-LogHost " They are saved in the same folder as the _PARTIAL.csv output file." -ForegroundColor Gray
            Write-LogHost ""
            Write-LogHost " COMMON LOCATIONS TO CHECK:" -ForegroundColor Yellow
            Write-LogHost " • The 'output' folder where you typically save exports" -ForegroundColor White
            Write-LogHost " • The folder shown in the Ctrl+C message when the run was interrupted" -ForegroundColor White
            Write-LogHost " • Look for _PARTIAL.csv files - the checkpoint is in the same folder" -ForegroundColor White
            Write-LogHost ""
            Write-LogHost " HOW TO RESUME:" -ForegroundColor Cyan
            Write-LogHost ""
            Write-LogHost " Option 1: Specify the folder containing the checkpoint:" -ForegroundColor White
            Write-LogHost " -Resume -OutputPath `"C:\path\to\output\folder`"" -ForegroundColor Green
            Write-LogHost ""
            Write-LogHost " Option 2: Specify the checkpoint file directly:" -ForegroundColor White
            Write-LogHost " -Resume `"C:\path\to\.pax_checkpoint_20260120_123456.json`"" -ForegroundColor Green
            Write-LogHost ""
            Write-LogHost " Option 3: Run from the folder containing the checkpoint:" -ForegroundColor White
            Write-LogHost " cd `"C:\path\to\output\folder`"" -ForegroundColor Green
            Write-LogHost " pwsh -File `"...\PAX_Purview_Audit_Log_Processor.ps1`" -Resume" -ForegroundColor Green
            Write-LogHost ""
            Write-LogHost "════════════════════════════════════════════════════════════════════════════════" -ForegroundColor Red
            Write-LogHost ""
            exit 1
        }
        elseif ($checkpoints.Count -eq 1) {
            $selectedCheckpoint = $checkpoints[0]
            Write-LogHost "Found checkpoint: $($selectedCheckpoint.FileName)" -ForegroundColor Green
        }
        else {
            # Multiple checkpoints found
            if ($Force) {
                # Use most recent without prompting
                # "Most recent" is decided by each entry's LastUpdated property.
                $selectedCheckpoint = $checkpoints | Sort-Object { $_.LastUpdated } -Descending | Select-Object -First 1
                Write-LogHost "Multiple checkpoints found. -Force specified, using most recent:" -ForegroundColor Yellow
                Write-LogHost " $($selectedCheckpoint.FileName)" -ForegroundColor White
            }
            else {
                # Prompt user to select
                # A falsy result from Select-Checkpoint ends the run with exit code 0.
                $selectedCheckpoint = Select-Checkpoint -Checkpoints $checkpoints
                if (-not $selectedCheckpoint) {
                    Write-LogHost "No checkpoint selected. Exiting." -ForegroundColor Yellow
                    exit 0
                }
            }
        }

        $script:CheckpointPath = $selectedCheckpoint.Path
        $checkpointLoadSuccess = Read-Checkpoint -CheckpointPath $script:CheckpointPath
        if (-not $checkpointLoadSuccess) {
            Write-LogHost "ERROR: Failed to load checkpoint file. Cannot resume." -ForegroundColor Red
            exit 1
        }
        # Read-Checkpoint sets $script:CheckpointData on success
        $checkpointData = $script:CheckpointData
    }

    # Note: $script:CheckpointData and $script:IsResumeMode already set by Read-Checkpoint

    # Display resume summary
    # Each counter falls back to 0 when the corresponding checkpoint field is missing or empty.
    $completedCount = if ($checkpointData.partitions.completed) { $checkpointData.partitions.completed.Count } else { 0 }
    $queryCreatedCount = if ($checkpointData.partitions.queryCreated) { $checkpointData.partitions.queryCreated.Count } else { 0 }
    $totalPartitions = if ($checkpointData.partitions.total) { $checkpointData.partitions.total } else { 0 }

    Write-LogHost ""
    Write-LogHost "Checkpoint loaded successfully:" -ForegroundColor Green
    Write-LogHost " Original Run: $($checkpointData.runTimestamp)" -ForegroundColor White
    # Display-only date strings; 'Unknown' is shown when the value is absent or cannot be parsed.
    $cpStartDate = if ($checkpointData.parameters.startDate) { $d = script:Parse-DateSafe $checkpointData.parameters.startDate; if ($d) { $d.ToString('yyyy-MM-dd') } else { 'Unknown' } } else { 'Unknown' }
    $cpEndDate = if ($checkpointData.parameters.endDate) { $d = script:Parse-DateSafe $checkpointData.parameters.endDate; if ($d) { $d.ToString('yyyy-MM-dd') } else { 'Unknown' } } else { 'Unknown' }
    Write-LogHost " Date Range: $cpStartDate to $cpEndDate" -ForegroundColor White
    Write-LogHost " Total Partitions: $totalPartitions" -ForegroundColor White
    Write-LogHost " Completed: $completedCount" -ForegroundColor Green
    Write-LogHost " Query Created: $queryCreatedCount (will attempt data fetch)" -ForegroundColor Yellow
    # "Remaining" is total minus completed; partitions in the queryCreated state are not subtracted.
    Write-LogHost " Remaining: $($totalPartitions - $completedCount)" -ForegroundColor Cyan
    Write-LogHost ""

    # ============================================================
    # RESTORE ALL PARAMETERS FROM CHECKPOINT
    # ============================================================
    Write-LogHost "Restoring parameters from checkpoint..." -ForegroundColor DarkGray
    $cp = $checkpointData.parameters

    # Restore original run timestamp so incremental files use consistent naming
    # This ensures all partition files (original run + resumes) share the same timestamp
    if ($checkpointData.runTimestamp) {
        $global:ScriptRunTimestamp = $checkpointData.runTimestamp
        Write-LogHost " Restored original run timestamp: $($global:ScriptRunTimestamp)" -ForegroundColor DarkGray
    }

    # Date range (required) - using locale-safe parsing
    # Unlike the display-only parsing above, a parse failure here throws and aborts the resume.
    $parsedStart = script:Parse-DateSafe $cp.startDate
    if (-not $parsedStart) { throw "Failed to parse checkpoint startDate: $($cp.startDate)" }
    $StartDate = $parsedStart.ToString('yyyy-MM-dd')

    $parsedEnd = script:Parse-DateSafe $cp.endDate
    if (-not $parsedEnd) { throw "Failed to parse checkpoint endDate: $($cp.endDate)" }
    $EndDate = $parsedEnd.ToString('yyyy-MM-dd')

    # Activity/Record filtering
    # Only non-empty checkpoint lists overwrite the current values.
    if ($cp.activityTypes -and $cp.activityTypes.Count -gt 0) { $ActivityTypes = $cp.activityTypes }
    if ($cp.recordTypes -and $cp.recordTypes.Count -gt 0) { $RecordTypes = $cp.recordTypes }
    if ($cp.serviceTypes -and $cp.serviceTypes.Count -gt 0) { $ServiceTypes = $cp.serviceTypes }
    if ($cp.userIds -and $cp.userIds.Count -gt 0) { $UserIds = $cp.userIds }
    if ($cp.groupNames -and $cp.groupNames.Count -gt 0) { $GroupNames = $cp.groupNames }

    # Agent filtering
    if ($cp.agentId -and $cp.agentId.Count -gt 0) { $AgentId = $cp.agentId }
    # Switches are only ever turned on from the checkpoint; a false/missing value leaves them as-is.
    if ($cp.agentsOnly) { $AgentsOnly = [switch]$true }
if ($cp.excludeAgents) { $ExcludeAgents = [switch]$true }

    # Prompt filtering
    if ($cp.promptFilter) { $PromptFilter = $cp.promptFilter }

    # Schema/Explosion settings
    # Values are restored only when truthy in the checkpoint, so 0/empty/false entries are skipped.
    if ($cp.explodeArrays) { $ExplodeArrays = [switch]$true }
    if ($cp.explodeDeep) { $ExplodeDeep = [switch]$true }
    if ($cp.flatDepth) { $FlatDepth = $cp.flatDepth }
    if ($cp.streamingSchemaSample) { $StreamingSchemaSample = $cp.streamingSchemaSample }
    if ($cp.streamingChunkSize) { $StreamingChunkSize = $cp.streamingChunkSize }
    # Allow user to override explosion threads on resume (different machine/load)
    if (-not $PSBoundParameters.ContainsKey('ExplosionThreads') -and $cp.explosionThreads) { $ExplosionThreads = $cp.explosionThreads }

    # M365/User info bundles
    if ($cp.includeM365Usage) { $IncludeM365Usage = [switch]$true }
    if ($cp.includeUserInfo) { $IncludeUserInfo = [switch]$true }
    if ($cp.includeDSPMForAI) { $IncludeDSPMForAI = [switch]$true }
    if ($cp.includeCopilotInteraction) { $IncludeCopilotInteraction = [switch]$true }
    if ($cp.excludeCopilotInteraction) { $ExcludeCopilotInteraction = [switch]$true }

    # Partitioning
    if ($cp.blockHours) { $BlockHours = $cp.blockHours }
    if ($cp.partitionHours) { $PartitionHours = $cp.partitionHours }
    if ($cp.maxPartitions) { $MaxPartitions = $cp.maxPartitions }

    # Output settings
    # NOTE: a checkpointed outputPath replaces $OutputPath even if -OutputPath was passed on resume.
    if ($cp.outputPath) { $OutputPath = $cp.outputPath }
    if ($cp.exportWorkbook) { $ExportWorkbook = [switch]$true }
    if ($cp.combineOutput) { $CombineOutput = [switch]$true }

    # Auth - only restore if user didn't override
    if (-not $PSBoundParameters.ContainsKey('Auth') -and $cp.auth) { $Auth = $cp.auth }
    if (-not $PSBoundParameters.ContainsKey('TenantId') -and $cp.tenantId) { $TenantId = $cp.tenantId }
    if (-not $PSBoundParameters.ContainsKey('ClientId') -and $cp.clientId) { $ClientId = $cp.clientId }

    # Other settings
    if ($cp.resultSize) { $ResultSize = $cp.resultSize }
    if ($cp.maxConcurrency) { $MaxConcurrency = $cp.maxConcurrency }
    # MaxMemoryMB: Allow user override on resume (different machine may have different RAM)
    # Tested with "$null -ne" rather than truthiness so a checkpointed value of 0 is still restored.
    if (-not $PSBoundParameters.ContainsKey('MaxMemoryMB') -and $null -ne $cp.maxMemoryMB) {
        $MaxMemoryMB = $cp.maxMemoryMB
        Write-LogHost " Restored MaxMemoryMB from checkpoint: $MaxMemoryMB" -ForegroundColor DarkGray
    }
    if ($cp.useEOM) { $UseEOM = [switch]$true }
    if ($cp.autoCompleteness) { $AutoCompleteness = [switch]$true }
    if ($cp.includeTelemetry) { $IncludeTelemetry = [switch]$true }

    # Set the partial output path from checkpoint
    # The partial file name is resolved relative to the folder that holds the checkpoint file.
    $checkpointDir = Split-Path $script:CheckpointPath -Parent
    $script:PartialOutputPath = Join-Path $checkpointDir $checkpointData.outputFiles.partialCsv

    # Set FinalOutputPath by stripping _PARTIAL from the partial path
    $partialBaseName = [System.IO.Path]::GetFileNameWithoutExtension($script:PartialOutputPath)
    $partialExt = [System.IO.Path]::GetExtension($script:PartialOutputPath)
    $finalBaseName = $partialBaseName -replace '_PARTIAL$', ''
    $script:FinalOutputPath = Join-Path $checkpointDir "${finalBaseName}${partialExt}"

    # Set OutputFile and CsvOutputFile to the partial path for execution
    $OutputFile = $script:PartialOutputPath
    # For ExportWorkbook mode, CsvOutputFile must use .csv extension (not .xlsx from PartialOutputPath)
    if ($ExportWorkbook) {
        $csvBaseName = [System.IO.Path]::GetFileNameWithoutExtension($script:PartialOutputPath)
        $script:CsvOutputFile = Join-Path $checkpointDir "${csvBaseName}.csv"
    } else {
        $script:CsvOutputFile = $script:PartialOutputPath
    }

    # Check for incremental data files
    # True only when the .pax_incremental folder exists and contains at least one *.jsonl file.
    $incrementalDir = Join-Path $checkpointDir ".pax_incremental"
    $hasIncrementalData = (Test-Path $incrementalDir) -and @(Get-ChildItem -Path $incrementalDir -Filter "*.jsonl" -ErrorAction SilentlyContinue).Count -gt 0

    if (-not (Test-Path $script:PartialOutputPath) -and -not $hasIncrementalData) {
        Write-LogHost "WARNING: No partial data found (neither _PARTIAL.csv nor .pax_incremental files)" -ForegroundColor Yellow
        Write-LogHost " Will start fresh data collection." -ForegroundColor Yellow
    }
    elseif ($hasIncrementalData) {
        $incrementalFiles = @(Get-ChildItem -Path $incrementalDir -Filter "*.jsonl" -ErrorAction SilentlyContinue)
        Write-LogHost "Found $($incrementalFiles.Count) incremental data file(s) in .pax_incremental/" -ForegroundColor Green
    }

    # Set log file to match the _PARTIAL output file (deferred from earlier setup)
    $logBaseName = [System.IO.Path]::GetFileNameWithoutExtension($script:PartialOutputPath)
    $logDir = Split-Path $script:PartialOutputPath -Parent
    $script:LogFile = Join-Path $logDir ("{0}.log" -f $logBaseName)
    $LogFile = $script:LogFile

    # Check if original log exists
    # Captured before the marker is written so the header line is added only for a brand-new log.
    $logFileExisted = Test-Path $script:LogFile

    # Add clear resume session marker to the log (appends to existing log, or creates new)
    $resumeMarker = @"
$(if (-not $logFileExisted) { "=== Portable Audit eXporter (PAX) - Purview Audit Log Exporter ===`n(Original log file was not found - this is a resumed session)`n" })
============================================================================================================
 RESUME SESSION STARTED
============================================================================================================
 Resume Time (UTC): $((Get-Date).ToUniversalTime().ToString('yyyy-MM-dd HH:mm:ss')) UTC
 Resume Time (Local): $(Get-Date -Format 'yyyy-MM-dd HH:mm:ss')
 Original Run: $($checkpointData.runTimestamp)
 Checkpoint File: $(Split-Path $script:CheckpointPath -Leaf)
 Partitions Completed: $completedCount / $totalPartitions
 Partitions Remaining: $($totalPartitions - $completedCount)
============================================================================================================

"@
    # Best effort: a failure to write the marker is suppressed and does not stop the resume.
    Add-Content -Path $script:LogFile -Value $resumeMarker -Encoding UTF8 -ErrorAction SilentlyContinue

    # Flush any buffered log entries now that log file is set
    if ($script:LogBuffer -and $script:LogBuffer.Count -gt 0) {
        foreach ($entry in $script:LogBuffer) {
            # Per-entry write failures are ignored; the buffer is cleared regardless.
            try { Add-Content -Path $script:LogFile -Value $entry -Encoding UTF8 -ErrorAction SilentlyContinue } catch {}
        }
        $script:LogBuffer.Clear()
    }

    # Display restored settings summary
    Write-LogHost ""
    Write-LogHost "Restored settings from checkpoint:" -ForegroundColor DarkGray
    Write-LogHost " Date Range: $StartDate to $EndDate" -ForegroundColor DarkGray
    # Custom Activities: Show user-specified activity types (not including auto-added CopilotInteraction)
    $customActivities = @($ActivityTypes | Where-Object { $_ -ne 'CopilotInteraction' })
    if ($customActivities.Count -gt 0) {
        Write-LogHost " Custom Activities: $($customActivities -join ', ')" -ForegroundColor DarkGray
    } else {
        Write-LogHost " Custom Activities: None" -ForegroundColor DarkGray
    }
    # CopilotInteraction status
    # The exclusion switch takes precedence over presence in $ActivityTypes for this label.
    $copilotStatus = if ($ExcludeCopilotInteraction) { 'Excluded' } elseif ($ActivityTypes -contains 'CopilotInteraction') { 'Included' } else { 'Not included' }
    Write-LogHost " CopilotInteraction: $copilotStatus" -ForegroundColor DarkGray
    # M365 Usage status
    $m365Status = if ($IncludeM365Usage) { 'Included' } else { 'Not included' }
    Write-LogHost " M365 Usage: $m365Status" -ForegroundColor DarkGray
    # When both explosion switches are set, only 'ExplodeDeep' is reported.
    if ($ExplodeArrays -or $ExplodeDeep) { Write-LogHost " Explosion: $(if ($ExplodeDeep) { 'ExplodeDeep' } elseif ($ExplodeArrays) { 'ExplodeArrays' })" -ForegroundColor DarkGray }
    if ($PSBoundParameters.ContainsKey('Auth')) {
        Write-LogHost " Auth (override): $Auth" -ForegroundColor Yellow
    } else {
        Write-LogHost " Auth (restored): $Auth" -ForegroundColor DarkGray
    }
    Write-LogHost ""

    Write-LogHost "Resume mode initialized. Will continue from last checkpoint."
-ForegroundColor Cyan
    Write-LogHost ""

    # Re-run MaxMemoryMB resolution after checkpoint restore (may have restored -1 for auto-detect)
    # Meaning of $MaxMemoryMB in this block: -1 = auto-detect (75% of physical RAM),
    # 0 = memory management disabled, any other value = explicit limit in MB.
    $script:ResolvedMaxMemoryMB = $MaxMemoryMB
    if ($MaxMemoryMB -eq -1) {
        try {
            # -ErrorAction Stop: with SilentlyContinue a failed CIM query returns $null, "$null / 1MB"
            # evaluates to 0 without throwing, and the limit silently became 0 (i.e. disabled)
            # instead of reaching the 4096MB fallback below.
            $totalRAM = [math]::Round((Get-CimInstance -ClassName Win32_ComputerSystem -ErrorAction Stop).TotalPhysicalMemory / 1MB, 0)
            # Guard against a query that succeeds but reports no usable memory figure.
            if ($totalRAM -le 0) { throw "Win32_ComputerSystem did not report a usable TotalPhysicalMemory value" }
            $script:ResolvedMaxMemoryMB = [math]::Round($totalRAM * 0.75, 0)
            Write-LogHost "Memory management (resume): Auto-detected ${totalRAM}MB total RAM -> limit $($script:ResolvedMaxMemoryMB)MB (75%)" -ForegroundColor Cyan
        } catch {
            $script:ResolvedMaxMemoryMB = 4096
            Write-LogHost "Memory management (resume): Could not detect system RAM, defaulting to 4096MB limit" -ForegroundColor Yellow
        }
    } elseif ($MaxMemoryMB -eq 0) {
        Write-LogHost "Memory management (resume): DISABLED (-MaxMemoryMB 0)" -ForegroundColor DarkGray
    } else {
        Write-LogHost "Memory management (resume): Using $($script:ResolvedMaxMemoryMB)MB limit" -ForegroundColor DarkGray
    }
    # Re-evaluate memoryFlushEnabled with restored/resolved value
    # Flushing is enabled only with a positive limit and when no explosion mode is active.
    $script:memoryFlushEnabled = ($script:ResolvedMaxMemoryMB -gt 0) -and (-not $ExplodeDeep) -and (-not $ExplodeArrays) -and (-not $ForcedRawInputCsvExplosion)
}

# Authentication and Entra data collection (live mode only)
if (-not $RAWInputCSV) {
    # Newest installed ExchangeOnlineManagement version, or $null when the module is not installed.
    $existingEOM = Get-Module -ListAvailable -Name ExchangeOnlineManagement | Sort-Object Version -Descending | Select-Object -First 1
    if (-not $existingEOM -and $UseEOM) {
        Write-LogHost "Installing ExchangeOnlineManagement module..." -ForegroundColor Yellow
        # -ErrorAction Stop turns non-terminating install errors into exceptions so the catch
        # runs instead of the success message being printed after a failed install.
        try { Install-Module -Name ExchangeOnlineManagement -Scope CurrentUser -Force -AllowClobber -ErrorAction Stop; Write-LogHost "Module installed successfully."
-ForegroundColor Green } catch { Write-LogHost "Failed to install ExchangeOnlineManagement module: $($_.Exception.Message)" -ForegroundColor Red; exit 1 }
    }
    if ($UseEOM) {
        Import-Module ExchangeOnlineManagement -Force
    }

    # Use unified authentication function
    Connect-PurviewAudit -AuthMethod $Auth -UseEOMMode $UseEOM

    # Fetch user directory and license data if requested (Graph API mode only)
    $script:LicenseData = $null
    $script:EntraUsersData = $null
    if ($IncludeUserInfo -and -not $UseEOM) {
        Write-LogHost "Fetching Entra user directory and license data..." -ForegroundColor Cyan
        $script:LicenseData = Get-UserLicenseData
        $script:EntraUsersData = Get-EntraUsersData
    }
    elseif ($IncludeUserInfo -and $UseEOM) {
        Write-LogHost "WARNING: -IncludeUserInfo requires Graph API mode (not supported with -UseEOM)" -ForegroundColor Yellow
        Write-LogHost " EntraUsers output will not be generated" -ForegroundColor Yellow
        Write-LogHost ""
    }
}

# Skip all audit log queries when only exporting user data
if (-not $OnlyUserInfo) {
    $allLogs = New-Object System.Collections.ArrayList
    if ($RAWInputCSV) {
        # Replay mode: build audit records from a previously exported CSV instead of querying.
        Write-LogHost "Replay mode enabled: ingesting raw Purview CSV '$RAWInputCSV' (no Graph/EOM connections)" -ForegroundColor Yellow
        if (-not (Test-Path $RAWInputCSV)) { Write-LogHost "Replay file not found: $RAWInputCSV" -ForegroundColor Red; exit 1 }
        # @() keeps $csvData an array for 0- and 1-row files, so .Count below is always the row
        # count (a single imported row is otherwise a bare object, and a CSV column literally
        # named "Count" would shadow the property).
        $csvData = @(Import-Csv -Path $RAWInputCSV)
        Write-LogHost ("Replay rows ingested: {0}" -f $csvData.Count) -ForegroundColor DarkGray
        # Smoke-test the conversion on the first row; a failure is logged but does not stop the run.
        try {
            $sampleRow = $csvData | Select-Object -First 1
            $identity = if ($sampleRow.Id) { $sampleRow.Id } elseif ($sampleRow.RecordId) { $sampleRow.RecordId } else { [guid]::NewGuid().ToString() }
            $rec = [pscustomobject]@{
                RecordType = $(try { [int]$sampleRow.RecordType } catch { 0 })
                CreationDate = $(if ($sampleRow.CreationDate) { $d = script:Parse-DateSafe $sampleRow.CreationDate; if ($d) { $d } else { Get-Date } } else { Get-Date })
                UserIds = @(if ($sampleRow.UserIds) { $sampleRow.UserIds } elseif ($sampleRow.UserId) { $sampleRow.UserId } else { $null })
                Operations = if ($sampleRow.Operations) { $sampleRow.Operations } elseif ($sampleRow.Operation) { $sampleRow.Operation } else { $null }
                ResultStatus = $(try { $sampleRow.ResultStatus } catch { '' })
                ResultCount = 0
                Identity = $identity
                IsValid = $true
                ObjectState = ''
                AuditData = $sampleRow.AuditData
                Operation = if ($sampleRow.Operation) { $sampleRow.Operation } elseif ($sampleRow.Operations) { $sampleRow.Operations } else { $null }
                UserId = if ($sampleRow.UserId) { $sampleRow.UserId } elseif ($sampleRow.UserIds) { $sampleRow.UserIds } else { $null }
            }
            $sampleOut = Convert-ToPurviewExplodedRecords -Record $rec -SkipMetrics
            # sample row count preview removed (verbosity reduction)
            # sample columns preview removed (verbosity reduction)
        } catch {
            Write-LogHost ("Replay sample conversion failed: {0}" -f $_.Exception.Message) -ForegroundColor DarkYellow
        }
        # Filters apply only when the user passed the corresponding parameter explicitly.
        $applyDateFilter = ($PSBoundParameters.ContainsKey('StartDate') -or $PSBoundParameters.ContainsKey('EndDate'))
        $applyActivityFilter = ($PSBoundParameters.ContainsKey('ActivityTypes') -and $ActivityTypes -and $ActivityTypes.Count -gt 0)
        $startFilter = $null; $endFilter = $null
        if ($applyDateFilter) {
            # A bound that cannot be parsed as yyyy-MM-dd stays $null, i.e. that side is unbounded.
            if ($PSBoundParameters.ContainsKey('StartDate')) { try { $startFilter = [datetime]::ParseExact($StartDate, 'yyyy-MM-dd', $null) } catch {} }
            if ($PSBoundParameters.ContainsKey('EndDate')) { try { $endFilter = [datetime]::ParseExact($EndDate, 'yyyy-MM-dd', $null) } catch {} }
        }
        $activitySet = $null
        # Case-insensitive set of the requested operation names.
        if ($applyActivityFilter) { $activitySet = New-Object System.Collections.Generic.HashSet[string] ([System.StringComparer]::OrdinalIgnoreCase); foreach ($a in $ActivityTypes) { if ($a) { [void]$activitySet.Add($a) } } }
        $filteredRows = New-Object System.Collections.Generic.List[object]
        foreach ($row in $csvData) {
            $keep = $true
            $creationRaw = $row.CreationDate
            $creation = if ($creationRaw) { script:Parse-DateSafe $creationRaw } else { $null }
            # Window is [start, end): start inclusive, end exclusive. Rows whose date is missing or
            # unparseable skip the date filter and are kept.
            if ($applyDateFilter -and $creation) {
                if ($startFilter -and $creation -lt $startFilter) { $keep = $false }
                if ($endFilter -and $creation -ge $endFilter) { $keep = $false }
            }
            if ($keep -and $applyActivityFilter) {
                $op = if ($row.Operation) { $row.Operation } elseif ($row.Operations) { $row.Operations } else { $null }
                if (-not $op -or -not $activitySet.Contains([string]$op)) { $keep = $false }
            }
            if (-not $keep) { continue }
            $auditData = $row.AuditData
            # Rows without an Id/RecordId column value get a freshly generated GUID as identity.
            $identity = if ($row.Id) { $row.Id } elseif ($row.RecordId) { $row.RecordId } else { [guid]::NewGuid().ToString() }
            $rec = [pscustomobject]@{
                RecordType = $(try { [int]$row.RecordType } catch { 0 })
                CreationDate = $(if ($creation) { $creation } else { Get-Date })
                UserIds = @(if ($row.UserIds) { $row.UserIds } elseif ($row.UserId) { $row.UserId } else { $null })
                Operations = if ($row.Operations) { $row.Operations } elseif ($row.Operation) { $row.Operation } else { $null }
                ResultStatus = $(try { $row.ResultStatus } catch { '' })
                ResultCount = 0
                Identity = $identity
                IsValid = $true
                ObjectState = ''
                AuditData = $auditData
                Operation = if ($row.Operation) { $row.Operation } elseif ($row.Operations) { $row.Operations } else { $null }
                UserId = if ($row.UserId) { $row.UserId } elseif ($row.UserIds) { $row.UserIds } else { $null }
            }
            [void]$filteredRows.Add($row)
            [void]$allLogs.Add($rec)
        }
        $ingested = $allLogs.Count
        Write-LogHost ("Replay rows after filters: {0}" -f $ingested) -ForegroundColor DarkGray
        # Inline export for replay: execute immediately, bypass downstream pipeline
        try {
            Invoke-ReplayInlineExport -Logs $allLogs
            return
        }
        catch {
            # Previously an empty catch: the failure vanished without a trace. Execution still
            # continues with the state set up below, but the cause is now recorded.
            Write-LogHost ("Replay inline export failed; continuing with downstream pipeline: {0}" -f $_.Exception.Message) -ForegroundColor DarkYellow
        }
        $queryPlan = @(); $sequentialGroups = 0; $parallelDecision = @{ Enabled = $false; Reason = 'Replay'; AutoEligible = $false }; $parallelOverallEnabled = $false
        $script:metrics.TotalRecordsFetched = $ingested
$script:progressState.Query.Total = 1; $script:progressState.Query.Current = 1
    }
    else {
        # Live audit log query mode

        # Diagnostic query removed to reduce throttling overhead
        # Authentication and connectivity will be validated on first actual query

        # Build the list of users to filter on: explicit -UserIds plus the expanded members of
        # every -GroupNames entry, de-duplicated at the end.
        $script:targetUsers = @()
        if ($UserIds -or $GroupNames) {
            Write-LogHost ""; Write-LogHost "User/Group Filtering Enabled:" -ForegroundColor Cyan
            if ($UserIds) { $script:targetUsers += $UserIds; Write-LogHost " Individual users: $($UserIds.Count)" -ForegroundColor DarkCyan }
            if ($GroupNames) {
                Write-LogHost " Expanding groups to individual users..." -ForegroundColor DarkCyan
                foreach ($group in $GroupNames) {
                    $members = Expand-GroupToUsers -GroupIdentity $group -UseEOMMode $UseEOM
                    # Groups that expand to nothing contribute no users and are not reported here.
                    if ($members.Count -gt 0) {
                        $script:targetUsers += $members
                    }
                }
            }
            $script:targetUsers = $script:targetUsers | Select-Object -Unique
            Write-LogHost " Total target users after deduplication: $($script:targetUsers.Count)" -ForegroundColor Green; Write-LogHost ""
        }
        # Strict yyyy-MM-dd parsing; ParseExact throws if either value is not in that format.
        $startDateObj = [datetime]::ParseExact($StartDate, 'yyyy-MM-dd', $null)
        $endDateObj = [datetime]::ParseExact($EndDate, 'yyyy-MM-dd', $null)

        # NOTE(review): this code sits inside the enclosing "if (-not $OnlyUserInfo)" block, so the
        # first branch looks unreachable unless $OnlyUserInfo is changed in between - confirm.
        if ($OnlyUserInfo) {
            Write-LogHost "Fetching Entra user directory and license data only (no audit logs)..." -ForegroundColor Cyan
            Write-LogHost ""
        } else {
            Write-LogHost "Starting enterprise-grade audit log search..."
-ForegroundColor Yellow
            Write-LogHost "Date range: $($startDateObj.ToString('yyyy-MM-dd')) (inclusive) to $($endDateObj.ToString('yyyy-MM-dd')) (exclusive)" -ForegroundColor Gray
            Write-LogHost "Processing mode: $(if ($ExplodeDeep){'Deep Column Explosion (with Row Explosion)'} elseif ($ExplodeArrays){'Array Explosion'} else {'Standard 1:1'})" -ForegroundColor Gray
        }

        # Adaptive block sizing only applies to EOM mode (Graph API uses partitioning instead)
        if ($UseEOM -and -not $OnlyUserInfo) {
            Write-LogHost ""; Write-LogHost "Initializing adaptive block sizing (EOM mode)..." -ForegroundColor Cyan
        }

        # --- DSPM for AI: Build final ActivityTypes array (additive logic with exclusion override) ---
        # Steps 1-4 append types, Step 5 prepends the Copilot base type when applicable, Step 6
        # removes it again under -ExcludeCopilotInteraction, Step 7 de-duplicates and Step 8
        # aborts when nothing is left.
        if ($IncludeDSPMForAI) {
            Write-LogHost ""; Write-LogHost "=== DSPM for AI Configuration ===" -ForegroundColor Cyan; Write-LogHost ""
        } else {
            Write-LogHost "" # preserve a blank spacer line for readability without header
        }

        $finalActivityTypes = @()

        # Step 1: Add explicit -ActivityTypes parameter values (if provided and not default)
        if ($PSBoundParameters.ContainsKey('ActivityTypes') -and $ActivityTypes) {
            foreach ($actType in $ActivityTypes) {
                if ($actType -and $actType -ne '') {
                    $finalActivityTypes += $actType
                }
            }
            if ($finalActivityTypes.Count -gt 0) {
                Write-LogHost "Custom ActivityTypes provided: $($finalActivityTypes -join ', ')" -ForegroundColor Gray
            }
        }

        # Step 2: Add DSPM for AI activity types if switch enabled
        if ($IncludeDSPMForAI) {
            $finalActivityTypes += 'ConnectedAIAppInteraction'
            $finalActivityTypes += 'AIInteraction'
            # Only add AIAppInteraction if user didn't decline at PAYG prompt
            if (-not $script:RemoveAIAppInteraction) {
                $finalActivityTypes += 'AIAppInteraction'
            }
            if ($script:RemoveAIAppInteraction) {
                Write-LogHost "DSPM for AI: Adding ConnectedAIAppInteraction, AIInteraction (AIAppInteraction removed per user choice)" -ForegroundColor Cyan
            } else {
                Write-LogHost "DSPM for AI: Adding ConnectedAIAppInteraction, AIInteraction, AIAppInteraction" -ForegroundColor Cyan
            }
        }

        # Step 3: Add CopilotInteraction when explicitly requested
        if ($IncludeCopilotInteraction -and -not ($finalActivityTypes -contains $copilotBaseActivityType)) {
            $finalActivityTypes += $copilotBaseActivityType
            Write-LogHost "IncludeCopilotInteraction: Adding $copilotBaseActivityType (explicit request)" -ForegroundColor Cyan
        }

        # Step 4: Add Microsoft 365 usage bundle when requested
        if ($IncludeM365Usage) {
            $finalActivityTypes += $m365UsageActivityBundle
            Write-LogHost ("M365 Usage bundle: Adding {0} activity types across Exchange/SharePoint/OneDrive/Teams" -f $m365UsageActivityBundle.Count) -ForegroundColor Cyan

            # Merge any existing RecordTypes with the bundle, dropping empties and duplicates.
            $RecordTypes = @(
                if ($RecordTypes) { $RecordTypes }
                $m365UsageRecordBundle
            ) | Where-Object { $_ } | Select-Object -Unique
            if ($RecordTypes.Count -eq 0) { $RecordTypes = $null }

            # CRITICAL: Set ServiceTypes to NULL for IncludeM365Usage mode
            # This prevents splitting into 4 workload passes (Exchange, SharePoint, OneDrive, Teams)
            # and instead creates a single workload pass that queries all M365 operations together
            $ServiceTypes = $null
            Write-LogHost "M365 Usage bundle: ServiceTypes => NULL (single workload pass)" -ForegroundColor Gray

            if ($RecordTypes) {
                Write-LogHost "M365 Usage bundle: RecordTypes => $($RecordTypes -join ', ')" -ForegroundColor Gray
            }
        }
        # NOTE(review): this runs in the live-mode branch (the else of "if ($RAWInputCSV)"), so the
        # condition looks like it can never be true here - kept unchanged, confirm before removing.
        if ($RAWInputCSV) {
            Invoke-ReplayInlineExport -Logs $allLogs
            # Skip the rest of the pipeline; replay handled
            return
        }

        # Step 5: BASE ACTIVITY TYPE - Add CopilotInteraction as default base type
        # This is the core Microsoft 365 Copilot activity type (FREE, included in M365 Copilot licensing)
        # Captures ALL M365 Copilot usage including Teams meetings, Word, Excel, PowerPoint, Outlook, etc.
        # Auto-add when:
        # 1. User didn't explicitly provide -ActivityTypes parameter (default behavior), OR
        # 2. User specified any DSPM switch (implies Copilot context needed)
        # Exception: Always respect -ExcludeCopilotInteraction (handled in Step 6)
        $userProvidedCustomTypes = $PSBoundParameters.ContainsKey('ActivityTypes')
        $userWantsDSPM = $IncludeDSPMForAI
        if (-not $ExcludeCopilotInteraction) {
            # Auto-add if no custom types provided OR if DSPM switches used (implies Copilot data needed)
            if (-not $userProvidedCustomTypes -or $userWantsDSPM) {
                # Add CopilotInteraction if not already present
                if (-not ($finalActivityTypes -contains $copilotBaseActivityType)) {
                    $finalActivityTypes = @($copilotBaseActivityType) + $finalActivityTypes
                }
            }
        }

        # Step 6: EXCLUSION OVERRIDE - Remove CopilotInteraction if -ExcludeCopilotInteraction is true
        if ($ExcludeCopilotInteraction) {
            $beforeExclusion = $finalActivityTypes.Count
            # @() keeps the result an array: a pipeline that yields one item returns a bare string
            # and one that yields nothing returns null, which breaks later .Count / [0] use.
            $finalActivityTypes = @($finalActivityTypes | Where-Object { $_ -ne $copilotBaseActivityType })
            $afterExclusion = $finalActivityTypes.Count
            if ($beforeExclusion -ne $afterExclusion) {
                $removedCount = $beforeExclusion - $afterExclusion
                Write-LogHost "EXCLUSION: Removed $removedCount M365 Copilot activity type (ExcludeCopilotInteraction switch)" -ForegroundColor Red
            }
            else {
                Write-LogHost "EXCLUSION: No M365 Copilot type in list (ExcludeCopilotInteraction switch active)" -ForegroundColor DarkGray
            }
        }

        # Step 7: Deduplicate
        # Wrapped in @() for the same reason as Step 6, so the list stays an array of any size.
        $finalActivityTypes = @($finalActivityTypes | Select-Object -Unique)

        # Step 8: Validate array not empty
        if ($finalActivityTypes.Count -eq 0) {
            Write-LogHost ""
            Write-LogHost "ERROR: No activity types remain after processing." -ForegroundColor Red
            Write-LogHost "You excluded CopilotInteraction but provided no other activity types." -ForegroundColor Yellow
            Write-LogHost "Please specify activity types using -ActivityTypes or -IncludeDSPMForAI switch."
-ForegroundColor Yellow
            Write-LogHost ""
            exit 1
        }

        # Step 9: Apply to ActivityTypes variable
        $ActivityTypes = $finalActivityTypes

        # Step 9b: Update Parameter Snapshot with final ActivityTypes (if it exists)
        if ($paramSnapshot -and $paramSnapshot.Contains('ActivityTypes')) {
            $paramSnapshot['ActivityTypes'] = ($ActivityTypes -join ';')
        }

        # Step 10b: Display detailed output filenames now that ActivityTypes is finalized (if activity type switches were used)
        # Everything in this section is informational: it only prints the expected file names.
        if ($IncludeDSPMForAI -or $ExcludeCopilotInteraction) {
            Write-LogHost ""
            Write-LogHost "=== Output Files ===" -ForegroundColor Cyan
            if ($ExportWorkbook) {
                # Excel mode
                $outputDir = if ($OutputPath) { $OutputPath } else { "C:\Temp\" }
                # The preview path is built by plain concatenation, so a user-supplied -OutputPath
                # without a trailing separator used to print as e.g. "C:\outPurview_Audit_...".
                # A separate display variable is used so $outputDir itself keeps its original value.
                $outputDirDisplay = if ($outputDir -match '[\\/]$') { $outputDir } else { "${outputDir}$([System.IO.Path]::DirectorySeparatorChar)" }
                if ($CombineOutput) {
                    $baseName = "Purview_Audit_CombinedUsageActivity"
                    if ($IncludeUserInfo -and -not $UseEOM) { $baseName += "_EntraUsers" }
                    $excelDescriptor = if ($IncludeUserInfo -and -not $UseEOM) { 'multi-tab workbook (CombinedActivity + EntraUsers_MAClicensing)' } else { 'single-tab workbook' }
                    Write-LogHost "Output File: ${outputDirDisplay}${baseName}_.xlsx ($excelDescriptor)" -ForegroundColor White
                    if ($IncludeUserInfo -and -not $UseEOM) { Write-LogHost " Tabs: CombinedActivity, EntraUsers_MAClicensing" -ForegroundColor Gray }
                } else {
                    Write-LogHost "Output File: ${outputDirDisplay}Purview_Audit_MultiTab_.xlsx (multi-tab workbook)" -ForegroundColor White
                    if ($IncludeUserInfo -and -not $UseEOM) { Write-LogHost " Entra Users Tab: EntraUsers_MAClicensing" -ForegroundColor Gray }
                }
            } else {
                # CSV mode
                if ($CombineOutput) {
                    # Single combined CSV file
                    # Prefer the final (non-_PARTIAL) path when one has been set, e.g. on resume.
                    $displayPath = if ($script:FinalOutputPath) { $script:FinalOutputPath } else { $OutputFile }
                    Write-LogHost "Output File: $displayPath (combined - all activity types)" -ForegroundColor White
                    if ($IncludeUserInfo -and -not $UseEOM) { $entraFileLater = (Join-Path (Split-Path $displayPath -Parent) "EntraUsers_MAClicensing_${global:ScriptRunTimestamp}.csv"); Write-LogHost " Entra Users File: $entraFileLater" -ForegroundColor Gray }
                } else {
                    # Separate CSV files per activity type
                    $outputDir = Split-Path $OutputFile -Parent
                    # Pull the yyyyMMdd_HHmmss stamp out of the output file's base name.
                    $timestamp = [System.IO.Path]::GetFileNameWithoutExtension($OutputFile) -replace '.*_(\d{8}_\d{6}).*', '$1'
                    Write-LogHost "Output Directory: $outputDir\" -ForegroundColor White
                    Write-LogHost "Output Files: ${outputDir}\Purview_Audit__${timestamp}.csv" -ForegroundColor Gray
                    if ($IncludeUserInfo -and -not $UseEOM) { $entraFileSplit = "${outputDir}\EntraUsers_MAClicensing_${global:ScriptRunTimestamp}.csv"; Write-LogHost " Entra Users: $entraFileSplit" -ForegroundColor Gray }
                }
            }
        }

        # Display Parameter Snapshot (now shows final ActivityTypes after DSPM processing)
        Write-LogHost ""
        Write-LogHost "Parameter Snapshot:" -ForegroundColor Cyan
        foreach ($k in $paramSnapshot.Keys) { Write-LogHost (" {0} = {1}" -f $k, $paramSnapshot[$k]) -ForegroundColor DarkGray }
        Write-LogHost ""

        # Step 10: Display DSPM-specific options (only if DSPM switches are used)
        if ($IncludeDSPMForAI -or $ExcludeCopilotInteraction) {
            Write-LogHost ""
            Write-LogHost "=== DSPM for AI Options ===" -ForegroundColor Cyan
            if ($IncludeDSPMForAI) {
                Write-LogHost " DSPM for AI activity types enabled (See billing information for details)" -ForegroundColor Cyan
            }
            if ($ExcludeCopilotInteraction) {
                Write-LogHost " [!] M365 Copilot activity type excluded (CopilotInteraction)" -ForegroundColor Red
            }
        }
        # Step 11: Log conflict resolution if it occurred
        # $script:ConflictChoice carries the outcome label set where the conflict was resolved.
        if ($script:ConflictResolved) {
            Write-LogHost ""
            Write-LogHost "Conflict Resolution:" -ForegroundColor Yellow
            if ($script:ConflictChoice -eq 'INCLUDE') {
                Write-LogHost " User resolved conflict by choosing to INCLUDE M365 Copilot activity type." -ForegroundColor Green
                Write-LogHost " -ExcludeCopilotInteraction switch overridden."
-ForegroundColor Green
            }
            elseif ($script:ConflictChoice -eq 'EXCLUDE') {
                Write-LogHost " User resolved conflict by choosing to EXCLUDE M365 Copilot activity type." -ForegroundColor Red
                Write-LogHost " CopilotInteraction removed from ActivityTypes." -ForegroundColor Red
            }
            elseif ($script:ConflictChoice -eq 'EXCLUDE (Force mode)') {
                Write-LogHost " Force mode enabled: M365 Copilot activity type excluded without prompt." -ForegroundColor Red
                Write-LogHost " CopilotInteraction removed from ActivityTypes." -ForegroundColor Red
            }
            Write-LogHost "===================================" -ForegroundColor Cyan
        }

        # --- Validate AppendFile requires single-file output ---
        # -AppendFile is accepted only when the run produces one file: Excel workbook mode,
        # combined output, or exactly one activity type. Otherwise guidance is printed and the
        # script exits with code 1.
        if ($AppendFile) {
            # Count activity types being processed
            $activityTypeCount = $ActivityTypes.Count

            # Validate single-file output requirement
            $isExcelMode = $ExportWorkbook.IsPresent
            $isCombineMode = $CombineOutput.IsPresent
            $isSingleActivity = ($activityTypeCount -eq 1)

            if (-not ($isExcelMode -or $isCombineMode -or $isSingleActivity)) {
                # This error path uses Write-Host directly rather than Write-LogHost.
                Write-Host "ERROR: -AppendFile requires single-file output mode" -ForegroundColor Red
                Write-Host "" -ForegroundColor Yellow
                Write-Host "You have $activityTypeCount activity types selected, which would create multiple output files." -ForegroundColor Yellow
                Write-Host "Activity types: $($ActivityTypes -join ', ')" -ForegroundColor Yellow
                Write-Host "" -ForegroundColor Yellow
                Write-Host "Solutions (choose one):" -ForegroundColor Green
                Write-Host " 1. Add -ExportWorkbook to create multi-tab Excel file (recommended)" -ForegroundColor Green
                Write-Host " 2. Add -CombineOutput to merge all types into single CSV file" -ForegroundColor Green
                Write-Host " 3. Specify only ONE activity type (use -ActivityTypes parameter)" -ForegroundColor Green
                Write-Host "" -ForegroundColor Yellow
                Write-Host "Examples:" -ForegroundColor Cyan
                Write-Host " # Excel multi-tab (all activity types in one .xlsx file):" -ForegroundColor DarkGray
                Write-Host " -AppendFile 'Report.xlsx' -ExportWorkbook" -ForegroundColor White
                Write-Host "" -ForegroundColor DarkGray
                Write-Host " # Combined CSV (all activity types merged into one .csv file):" -ForegroundColor DarkGray
                Write-Host " -AppendFile 'Report.csv' -CombineOutput" -ForegroundColor White
                Write-Host "" -ForegroundColor DarkGray
                Write-Host " # Single activity type:" -ForegroundColor DarkGray
                Write-Host " -AppendFile 'Report.csv' -ActivityTypes CopilotInteraction" -ForegroundColor White
                exit 1
            }

            Write-LogHost "AppendFile validation: Single-file output confirmed" -ForegroundColor Green
            # Reported mode follows the same precedence as the check: Excel, then combined, then single type.
            if ($isExcelMode) {
                Write-LogHost " Mode: Excel multi-tab workbook" -ForegroundColor DarkGray
            } elseif ($isCombineMode) {
                Write-LogHost " Mode: Combined CSV (all activity types merged)" -ForegroundColor DarkGray
            } elseif ($isSingleActivity) {
                Write-LogHost " Mode: Single activity type ($($ActivityTypes[0]))" -ForegroundColor DarkGray
            }
            Write-LogHost ""
        }
        # --- End AppendFile Validation ---

        # --- DSPM for AI: Excel Export Data Storage Initialization ---
        # Prepares $script:ExcelExportData, a hashtable keyed by tab name.
        if ($ExportWorkbook) {
            $isDSPMEnabled = Test-DSPMFeaturesEnabled

            if ($CombineOutput) {
                # Single-tab mode: Store all rows in one array
                $combinedTabName = if ($isDSPMEnabled) { 'Combined_DSPM_Data' } else { 'CombinedUsageActivity' }
                $script:ExcelExportData = @{
                    $combinedTabName = @()
                }
                if ($IncludeUserInfo -and -not $UseEOM) {
                    Write-LogHost "Excel export: Combined mode (multi-tab: $combinedTabName + EntraUsers_MAClicensing)" -ForegroundColor Cyan
                } else {
                    Write-LogHost "Excel export: Combined mode (single tab: $combinedTabName)" -ForegroundColor Cyan
                }
            } else {
                # Multi-tab mode: Store rows by activity type
                $script:ExcelExportData = @{}
                Write-LogHost "Excel export: Multi-tab mode (separate tab per activity type)" -ForegroundColor Cyan
            }
        }
        # --- End Excel Export Initialization ---

        # === Activity Types for This Run ===
        if (-not $OnlyUserInfo) {
Write-LogHost ""
Write-LogHost "=== Activity Types for This Run ===" -ForegroundColor Cyan
# Sort activity types: non-DSPM first, then DSPM types at the bottom
# Relative order inside each of the two groups is the order found in $ActivityTypes.
$dspmTypes = @('AIAppInteraction', 'AIInteraction', 'ConnectedAIAppInteraction')
$nonDspmTypes = $ActivityTypes | Where-Object { $_ -notin $dspmTypes }
$dspmInList = $ActivityTypes | Where-Object { $_ -in $dspmTypes }
$sortedActivityTypes = @($nonDspmTypes) + @($dspmInList)
foreach ($act in $sortedActivityTypes) {
    Write-LogHost " • $act" -ForegroundColor White
}
Write-LogHost ""
            # Output mode display with format-specific defaults
            # CSV: Default SEPARATE (granular analysis - separate files per activity type)
            # Excel: Default SEPARATE (separate tabs per activity type)
            # Both: -CombineOutput switch forces combined mode

            # Determine effective combine mode based on format and user input
            # The two flags are always opposites: combined CSV <=> not separated Excel tabs.
            if ($CombineOutput) {
                # User explicitly specified -CombineOutput switch: use combined mode for both CSV and Excel
                $csvCombineMode = $true
                $excelSeparateMode = $false
            } else {
                # User didn't specify: use format-specific defaults (both default to separate)
                $csvCombineMode = $false # CSV defaults to separate files
                $excelSeparateMode = $true # Excel defaults to separated tabs (inverse logic)
            }

            # Human-readable label for the mode actually in effect for the chosen output format.
            if ($ExportWorkbook) {
                $outputMode = if ($excelSeparateMode) {
                    'Separated tabs (one tab per activity type)'
                } else {
                    'Combined (single tab with all activity types)'
                }
            } else {
                $outputMode = if ($csvCombineMode) {
                    'Combined (single CSV file with all activity types)'
                } else {
                    'Separated files (one CSV per activity type)'
                }
            }

            Write-LogHost "Output mode: $outputMode" -ForegroundColor Gray
            if (-not
$ExportWorkbook) { + if ($csvCombineMode) { + $displayPath = if ($script:FinalOutputPath) { $script:FinalOutputPath } else { $OutputFile } + $outputDir = Split-Path $displayPath -Parent + Write-LogHost "Output file: $displayPath" -ForegroundColor Gray + if ($IncludeUserInfo) { Write-LogHost "EntraUsers file (separate): $($OutputDir)\EntraUsers_MAClicensing_${global:ScriptRunTimestamp}.csv" -ForegroundColor Gray } + } else { + $outputDir = Split-Path $OutputFile -Parent + # Prefix for per-activity CSV files + $filePrefix = "Purview_Audit" + Write-LogHost "Output directory: $outputDir" -ForegroundColor Gray + Write-LogHost "Activity file pattern: ${filePrefix}__${global:ScriptRunTimestamp}.csv" -ForegroundColor Gray + if ($IncludeUserInfo) { Write-LogHost "EntraUsers file: EntraUsers_MAClicensing_${global:ScriptRunTimestamp}.csv" -ForegroundColor Gray } + } + } + Write-LogHost "===================================" -ForegroundColor Cyan + } # End if (-not $OnlyUserInfo) - Activity types display and output configuration + + Write-LogHost "" + + if (-not $RAWInputCSV) { + $allLogs = New-Object System.Collections.ArrayList + if (-not $UseEOM -and $ServiceTypes -and $ServiceTypes.Count -gt 0) { + $serviceRuns = $ServiceTypes + } else { + $serviceRuns = @($null) + } + if (-not $UseEOM -and $serviceRuns.Count -gt 1) { + Write-LogHost ("NOTE: security audit API accepts a single workload per query. 
Splitting run into {0} workload pass(es): {1}" -f $serviceRuns.Count, ($serviceRuns -join ', ')) -ForegroundColor Yellow + } + $servicePassIndex = 0 + } + else { + if (-not $serviceRuns) { $serviceRuns = @($null) } + } + foreach ($currentServiceFilter in $serviceRuns) { + $servicePassIndex++ + if (-not $UseEOM -and $currentServiceFilter) { + $workloadLabel = $currentServiceFilter + # Enhanced workload label when first Exchange pass includes cross-workload M365 usage types + if ($IncludeM365Usage -and $currentServiceFilter -eq 'Exchange' -and $servicePassIndex -eq 1) { + $workloadLabel = "$currentServiceFilter + M365 Usage (Office apps, Forms, Stream, Planner, PowerApps)" + } + Write-LogHost ("--- Processing workload {0}/{1}: {2} ---" -f $servicePassIndex, $serviceRuns.Count, $workloadLabel) -ForegroundColor DarkCyan + } + + $serviceActivities = $ActivityTypes + if ($currentServiceFilter -and $serviceOperationMap.ContainsKey($currentServiceFilter)) { + $mappedOps = @($serviceOperationMap[$currentServiceFilter] | Where-Object { $ActivityTypes -contains $_ }) + if ($mappedOps.Count -gt 0 -and $mappedOps.Count -lt $ActivityTypes.Count) { + $otherOps = @($ActivityTypes | Where-Object { $mappedOps -notcontains $_ }) + if ($otherOps.Count -gt 0) { + Write-LogHost (" Aligning operationFilters for {0}: queued for this workload only; other operations run in their own pass -> {1}" -f $currentServiceFilter, ($otherOps -join ', ')) -ForegroundColor DarkGray + } + } + if ($mappedOps.Count -gt 0) { + $serviceActivities = $mappedOps + } + } + + $serviceRecordTypes = $RecordTypes + if ($currentServiceFilter -and $RecordTypes) { + $matched = @() + $unmatched = @() + foreach ($rt in $RecordTypes) { + $rtServices = $recordTypeWorkloadMap[$rt] + if ($null -eq $rtServices -or $rtServices.Count -eq 0) { + $matched += $rt + continue + } + + if ($rtServices -contains $currentServiceFilter) { + $matched += $rt + } else { + $unmatched += $rt + } + } + if ($matched.Count -gt 0) { + # Enhanced 
output for M365 Usage bundle in first Exchange pass + if ($IncludeM365Usage -and $currentServiceFilter -eq 'Exchange' -and $servicePassIndex -eq 1) { + $m365RecordTypes = @('OfficeNative','MicrosoftForms','MicrosoftStream','PlannerPlan','PlannerTask','PowerAppsApp') + $m365Matched = @($matched | Where-Object { $m365RecordTypes -contains $_ }) + $exchangeMatched = @($matched | Where-Object { $m365RecordTypes -notcontains $_ }) + + if ($exchangeMatched.Count -gt 0) { + Write-LogHost (" Applying recordTypeFilters for Exchange -> {0}" -f ($exchangeMatched -join ', ')) -ForegroundColor DarkGray + } + if ($m365Matched.Count -gt 0) { + Write-LogHost (" Including M365 Usage recordTypes (cross-workload) -> {0}" -f ($m365Matched -join ', ')) -ForegroundColor Cyan + } + } else { + Write-LogHost (" Applying recordTypeFilters for {0} -> {1}" -f $currentServiceFilter, ($matched -join ', ')) -ForegroundColor DarkGray + } + $serviceRecordTypes = $matched | Select-Object -Unique + } else { + $serviceRecordTypes = $null + } + + if ($unmatched.Count -gt 0) { + Write-LogHost (" Queueing recordTypeFilters for future workload pass(es) -> {0}" -f ($unmatched -join ', ')) -ForegroundColor DarkGray + } + } + + $script:CurrentServiceFilter = $currentServiceFilter + $queryPlan = @(Get-QueryPlan -RequestedActivities $serviceActivities) # Force array wrapper + $script:progressBlocksCompleted = 0; $script:progressBlockHoursSum = 0.0 + $script:progressState.Query.Current = 0 + + # Initial rough estimate for display - will be updated as each group is processed + $totalEstimatedBlocks = $queryPlan.Count + $script:progressState.Query.Total = [int]$totalEstimatedBlocks + Set-ProgressPhase -Phase 'Query' -Status "Planning queries: $($queryPlan.Count) groups" + Write-LogHost "Planned $($queryPlan.Count) query groups" -ForegroundColor DarkCyan + $sequentialGroups = 0 + $ps7 = ($PSVersionTable.PSVersion.Major -ge 7) + if (-not $ps7 -and $ParallelMode -ne 'Off') { $ParallelMode = 'Off' } + 
$parallelDecision = Get-ParallelActivationDecision -QueryPlan $queryPlan -ParallelMode $ParallelMode -MaxParallelGroups $MaxParallelGroups -MaxConcurrency $MaxConcurrency + $parallelOverallEnabled = $parallelDecision.Enabled + Write-LogHost ("ParallelMode requested: {0} | Effective: {1} ({2})" -f $ParallelMode, ($(if ($parallelOverallEnabled) { 'Enabled' } else { 'Disabled' })), $parallelDecision.Reason) -ForegroundColor DarkCyan + if ($ParallelMode -eq 'Auto' -and -not $parallelOverallEnabled) { Write-LogHost "WARNING: ParallelMode Auto requested but heuristics not met -> running sequential. Reason: $($parallelDecision.Reason)." -ForegroundColor Yellow } + if ($enableParallelSwitchUsed) { Write-LogHost "-EnableParallel switch detected -> setting ParallelMode to On" -ForegroundColor DarkYellow } + $groupIndex = 0 + foreach ($grp in $queryPlan) { + $groupIndex++ + + # Calculate partition hours using default or user override (Graph API mode only) + $effectivePartitionHours = $PartitionHours + $userSpecifiedPartitionHours = ($PartitionHours -gt 0) + if (-not $UseEOM -and $effectivePartitionHours -eq 0) { + # Default to 12-hour partitions for Graph API mode + $effectivePartitionHours = 12 + if ($paramSnapshot -and $paramSnapshot.Contains('PartitionHours')) { + $paramSnapshot['PartitionHours'] = "${effectivePartitionHours} (auto)" + } + } + + # Calculate partitions from PartitionHours (Graph API) or use degree of parallelism (EOM) + if (-not $UseEOM -and $effectivePartitionHours -gt 0) { + $totalHours = ($endDateObj - $startDateObj).TotalHours + $calculatedPartitions = [Math]::Ceiling($totalHours / $effectivePartitionHours) + + # Apply MaxPartitions cap with auto-adjustment + if ($calculatedPartitions -gt $MaxPartitions) { + if (-not $userSpecifiedPartitionHours) { + # User didn't specify PartitionHours, so auto-adjust to 12h to stay under cap + $effectivePartitionHours = 12 + $calculatedPartitions = [Math]::Ceiling($totalHours / $effectivePartitionHours) + 
Write-LogHost " Auto-adjusted to ${effectivePartitionHours}h partitions to respect MaxPartitions cap ($MaxPartitions)" -ForegroundColor Cyan + } + # If still over cap after adjustment, hard cap it + if ($calculatedPartitions -gt $MaxPartitions) { + $calculatedPartitions = $MaxPartitions + # CRITICAL: Recalculate effectivePartitionHours to ensure equal time slices + $effectivePartitionHours = $totalHours / $calculatedPartitions + Write-LogHost " Applying MaxPartitions cap: $calculatedPartitions partitions @ ${effectivePartitionHours}h each" -ForegroundColor Magenta + } + } + + $degree = $calculatedPartitions + } else { + # EOM mode or PartitionHours not enabled: use concurrency-based degree + $degree = [Math]::Min($grp.Concurrency, $MaxConcurrency) + } + + # Large query warning (Graph API mode only) + if (-not $UseEOM -and $effectivePartitionHours -gt 0) { + $daySpan = ($endDateObj - $startDateObj).TotalDays + if ($daySpan -gt 30 -or $degree -gt 50) { + Write-LogHost "" + Write-LogHost " [!] LARGE QUERY DETECTED" -ForegroundColor Yellow + Write-LogHost " Date Range: $([Math]::Round($daySpan, 1)) days | Partitions: $degree @ ${effectivePartitionHours}h" -ForegroundColor Yellow + Write-LogHost " Large queries may take several hours to complete." -ForegroundColor Yellow + Write-LogHost " Smaller date ranges or -PartitionHours will reduce processing time." -ForegroundColor Yellow + + # Warn about interactive auth token expiration for long-running queries + $authMethod = $script:SharedAuthState.AuthMethod + if ($authMethod -in 'weblogin', 'devicecode') { + Write-LogHost "" + Write-LogHost " ℹ️ TOKEN REFRESH NOTE (Interactive Auth)" -ForegroundColor Cyan + Write-LogHost " Tokens expire after ~60-90 minutes. 
For queries exceeding 1 hour:" -ForegroundColor Cyan + Write-LogHost " • Token will be automatically refreshed from MSAL cache" -ForegroundColor Gray + Write-LogHost " • If cache is unavailable, browser prompt may appear" -ForegroundColor Gray + Write-LogHost " • Progress is saved automatically - use -Resume if interrupted" -ForegroundColor Gray + } + Write-LogHost "" + } + + # EXTREME VOLUME WARNING: Detect scenarios likely to exceed token lifetime (P2 advisory) + # Triggers on: very long date range with many activity types, or very high partition counts + # These runs routinely take 4-8+ hours and are prone to 401 token expiration mid-run + $activityCount = $grp.Activities.Count + if (($daySpan -gt 60 -and $activityCount -gt 10) -or $degree -gt 120) { + $estimatedHours = [Math]::Round(($degree * 2.5) / 60, 1) # ~2.5 min per partition is a rough average + Write-LogHost "" -ForegroundColor Red + Write-LogHost " ╔══════════════════════════════════════════════════════════════╗" -ForegroundColor Red + Write-LogHost " ║ ⚠ EXTREME VOLUME WARNING ║" -ForegroundColor Red + Write-LogHost " ╚══════════════════════════════════════════════════════════════╝" -ForegroundColor Red + Write-LogHost " Date Range: $([Math]::Round($daySpan, 0)) days | Activity Types: $activityCount | Partitions: $degree" -ForegroundColor Red + Write-LogHost " Estimated run time: $estimatedHours+ hours (actual may vary significantly)" -ForegroundColor Yellow + Write-LogHost "" -ForegroundColor Yellow + Write-LogHost " This query volume is very likely to encounter token expiration (401)" -ForegroundColor Yellow + Write-LogHost " during processing. 
Recommendations:" -ForegroundColor Yellow + $authMethod = $script:SharedAuthState.AuthMethod + if ($authMethod -in 'weblogin', 'devicecode') { + Write-LogHost " ► STRONGLY recommend switching to -AppRegistration auth" -ForegroundColor Cyan + Write-LogHost " (enables automatic silent token refresh without browser prompts)" -ForegroundColor Gray + } + Write-LogHost " ► Progress is checkpointed automatically — use -Resume if interrupted" -ForegroundColor Cyan + Write-LogHost " ► Incremental data is saved to disk every 500 pages per partition" -ForegroundColor Cyan + Write-LogHost " ► Consider splitting into smaller date ranges (e.g., 30-day windows)" -ForegroundColor Cyan + Write-LogHost "" -ForegroundColor Red + } + } + + if (-not $UseEOM) { + # For single query group, show simplified message without listing all activity types + # (activity types are already shown in "Activity Types for This Run" section) + if ($queryPlan.Count -eq 1) { + Write-LogHost "Query Group: Combined ($($grp.Activities.Count) activity types, partitions=$degree @ ${effectivePartitionHours}h)" -ForegroundColor Yellow + } else { + Write-LogHost "Group: $($grp.Name) (partitions=$degree @ ${effectivePartitionHours}h)" -ForegroundColor Yellow + # Show activity types per group when there are multiple groups + Write-LogHost " Activity Types: $($grp.Activities -join ', ')" -ForegroundColor Gray + } + } + $requestedDegree = $degree + $totalPartitions = $degree # Total number of time partitions to create + $maxConcurrentPartitions = $degree # Maximum concurrent execution + if ($degree -gt $MaxConcurrency) { + $maxConcurrentPartitions = $MaxConcurrency # Cap concurrent execution + try { $script:metrics.PartitionCapsApplied++; if ($script:metrics.PartitionCapHighestRequested -lt $requestedDegree) { $script:metrics.PartitionCapHighestRequested = $requestedDegree } } catch {} + Write-LogHost " Applying concurrency cap ($MaxConcurrency): requested $requestedDegree -> $maxConcurrentPartitions concurrent (all 
$totalPartitions queued)" -ForegroundColor Magenta + } + $withinCap = $groupIndex -le $MaxParallelGroups + $canParallel = $parallelOverallEnabled -and $withinCap -and ($PSVersionTable.PSVersion.Major -ge 7) -and ($degree -gt 1) + + # Update progress total now that we know parallel mode and partition count + # For parallel: each partition = 1 progress unit, For sequential: use BlockHours + if ($canParallel) { + # Parallel mode: total progress = number of partitions + $script:progressState.Query.Total = $degree + } else { + # Sequential mode: calculate blocks based on BlockHours + $sequentialBlocks = 0 + foreach ($act in $grp.Activities) { + try { + $initialBlock = Get-OptimalBlockSize -ActivityType $act + if (-not $initialBlock -or $initialBlock -le 0) { $initialBlock = $BlockHours } + $rangeHours = ($endDateObj - $startDateObj).TotalHours + $blocks = [int][Math]::Ceiling($rangeHours / $initialBlock) + if ($blocks -lt 1) { $blocks = 1 } + $sequentialBlocks += $blocks + } catch { + $sequentialBlocks += 1 + } + } + if ($sequentialBlocks -lt 1) { $sequentialBlocks = 1 } + $script:progressState.Query.Total = $sequentialBlocks + } + + # Adaptive memory pressure logic - ONLY for EOM mode (Graph API has low memory footprint) + if (-not $DisableAdaptive -and $UseEOM) { + try { + $workingSetMB = [math]::Round(([System.Diagnostics.Process]::GetCurrentProcess().WorkingSet64 / 1MB),0) + if ($workingSetMB -gt $MemoryPressureMB -and $MaxConcurrency -gt 1) { + $old = $MaxConcurrency + $MaxConcurrency = [Math]::Max(1, $MaxConcurrency - 1) + # Reduce concurrent execution, not total partitions + if ($maxConcurrentPartitions -gt $MaxConcurrency) { $maxConcurrentPartitions = $MaxConcurrency } + $script:metrics.AdaptiveMemoryReductions++ + $script:metrics.AdaptiveEvents += "Memory pressure detected (${workingSetMB}MB > ${MemoryPressureMB}MB) reduced MaxConcurrency $old -> $MaxConcurrency" + Write-LogHost "Adaptive: Memory pressure ($workingSetMB MB) reducing MaxConcurrency to 
$MaxConcurrency" -ForegroundColor DarkYellow
+                    }
+                } catch {}
+            }
+
+            # Concurrency guidance handled earlier; no additional adjustment section required
+
+            # Graph API mode: Can query multiple activities in single request
+            # EOM mode: Should only have one activity per group (enforced by Get-QueryPlan)
+            $activities = $grp.Activities # Array of activity types for this group
+            $activity = $grp.Activities[0] # Backward compatibility for single-activity logic
+
+            # Resolve the per-partition filters BEFORE branching on partition count.
+            # Both the multi-partition loop and the single-partition fallback below read
+            # $partitionRecordTypes / $partitionServiceFilter; assigning them only inside the
+            # multi-partition branch left the single-partition path with $null (first pass) or
+            # with stale values carried over from a previous group / workload pass.
+            # CRITICAL: When IncludeM365Usage is active, NEVER send recordTypes or serviceFilter
+            # Graph API rejects mixed cross-workload recordTypes with workload-specific serviceFilter
+            $partitionRecordTypes = $serviceRecordTypes
+            $partitionServiceFilter = $currentServiceFilter
+            if ($IncludeM365Usage) {
+                $partitionRecordTypes = $null
+                $partitionServiceFilter = $null
+            }
+
+            $partitions = @()
+            if ($totalPartitions -gt 1) {
+                $totalHours = ($endDateObj - $startDateObj).TotalHours
+                # Use effectivePartitionHours for Graph API mode, or calculate from totalPartitions for EOM mode
+                if (-not $UseEOM -and $effectivePartitionHours -gt 0) {
+                    $sliceHours = $effectivePartitionHours
+                } else {
+                    $sliceHours = $totalHours / $totalPartitions
+                }
+
+                # Slice [startDateObj, endDateObj) into $totalPartitions windows of $sliceHours each;
+                # the last window is pinned to $endDateObj so the full range is always covered.
+                for ($pi = 0; $pi -lt $totalPartitions; $pi++) {
+                    $pStart = $startDateObj.AddHours($sliceHours * $pi)
+                    $pEnd = if ($pi -eq ($totalPartitions - 1)) { $endDateObj } else { $startDateObj.AddHours($sliceHours * ($pi + 1)) }
+                    $partitions += [pscustomobject]@{
+                        Activities = $activities # Pass all activities for Graph API
+                        Activity = $activity # Backward compatibility
+                        PStart = $pStart
+                        PEnd = $pEnd
+                        Index = ($pi + 1)
+                        Total = $totalPartitions
+                        RecordTypes = $partitionRecordTypes
+                        ServiceFilter = $partitionServiceFilter
+                    }
+                }
+            } else {
+                # Single partition spanning the whole requested range
+                $partitions += [pscustomobject]@{
+                    Activities = $activities # Pass all activities for Graph API
+                    Activity = $activity # Backward compatibility
+                    PStart = $startDateObj
+                    PEnd = $endDateObj
+                    Index = 1
+                    Total = 1
+                    RecordTypes = 
$partitionRecordTypes + ServiceFilter = $partitionServiceFilter + } + } + + # ======================================== + # RESUME MODE: FILTER PARTITIONS + # ======================================== + # Skip already-completed partitions from checkpoint + $originalPartitionCount = $partitions.Count + $skippedPartitions = @() + $fetchOnlyPartitions = @() + + if ($script:CheckpointData -and $script:IsResumeMode) { + $partitionCategories = Get-PartitionsToProcess -AllPartitions $partitions + + if ($partitionCategories.ToSkip.Count -gt 0) { + $skippedPartitions = $partitionCategories.ToSkip + # Store the originally-skipped partition indices for summary display + # (This is captured BEFORE processing, so it only includes checkpoint-completed partitions) + $script:OriginallySkippedPartitionIndices = @($skippedPartitions | ForEach-Object { $_.Index }) + Write-LogHost " [RESUME] Skipping $($skippedPartitions.Count) already-completed partition(s): $($skippedPartitions.Index -join ', ')" -ForegroundColor Green + } + + if ($partitionCategories.ToFetchOnly.Count -gt 0) { + $fetchOnlyPartitions = $partitionCategories.ToFetchOnly + Write-LogHost " [RESUME] $($fetchOnlyPartitions.Count) partition(s) have pending queries - will fetch data only" -ForegroundColor Yellow + } + + if ($partitionCategories.ToCreateAndFetch.Count -gt 0) { + Write-LogHost " [RESUME] $($partitionCategories.ToCreateAndFetch.Count) partition(s) need full processing" -ForegroundColor Cyan + } + + # Replace partitions with only those needing work (fetch-only + create-and-fetch) + $partitions = @() + $partitions += $partitionCategories.ToFetchOnly + $partitions += $partitionCategories.ToCreateAndFetch + + if ($partitions.Count -eq 0) { + Write-LogHost " [RESUME] All partitions already completed! Skipping query group." 
-ForegroundColor Green + continue # Skip to next query group + } + + Write-LogHost " [RESUME] Processing $($partitions.Count) of $originalPartitionCount partition(s)" -ForegroundColor Cyan + } + + # Parallel processing using Start-ThreadJob + # CRITICAL: EOM mode is NOT compatible with parallel processing (implicit remoting session conflicts) + # Graph API mode uses REST calls which are thread-safe and session-independent + if ($canParallel) { + # ======================================== + # DUAL-MODE PARALLEL VALIDATION + # ======================================== + + if ($UseEOM) { + # EOM mode + parallel = guaranteed failure due to implicit remoting + Write-LogHost " ERROR: Parallel processing is not compatible with -UseEOM mode" -ForegroundColor Red + Write-LogHost " Reason: Exchange Online implicit remoting cannot be safely shared across ThreadJobs" -ForegroundColor Yellow + Write-LogHost " This combination should have been blocked by validation - falling back to sequential" -ForegroundColor Yellow + $canParallel = $false + } + else { + # Graph API mode - parallel is safe + Write-LogHost " Processing partitions in parallel (Graph API ThreadJobs, Max=$maxConcurrentPartitions)..." 
-ForegroundColor Cyan + } + } + + if ($canParallel) { + try { + # ======================================== + # GRAPH API PARALLEL EXECUTION + # ======================================== + # Uses ThreadJobs with REST API calls (no session dependency) + + # Get Graph context and access token to pass to threads + $mgContext = Get-MgContext -ErrorAction Stop + if (-not $mgContext) { + Write-LogHost " ERROR: Not connected to Microsoft Graph" -ForegroundColor Red + $canParallel = $false + } + + # Get access token for Graph API calls in threads + $accessToken = $null + try { + # For AppRegistration auth ONLY: proactively refresh if token is approaching expiration + # AppRegistration can refresh automatically without user interaction + # Token lifetime is typically 60-90 minutes; refresh proactively at 30 minutes for safety buffer + if ($script:AuthConfig.CanReauthenticate -and $script:AuthConfig.Method -eq 'AppRegistration') { + $tokenAge = $null + if ($script:AuthConfig.TokenIssueTime) { + $tokenAge = (Get-Date) - $script:AuthConfig.TokenIssueTime + } + + # Refresh if token is older than 30 minutes (proactive, well before ~60 min expiry) + if ($tokenAge -and $tokenAge.TotalMinutes -gt 30) { + Write-LogHost " [TOKEN] Token age: $([Math]::Round($tokenAge.TotalMinutes, 1)) minutes - proactively refreshing..." 
-ForegroundColor Yellow + $refreshResult = Invoke-TokenRefresh -Force + if ($refreshResult.Success -and $refreshResult.NewToken) { + $accessToken = $refreshResult.NewToken + $script:AuthConfig.TokenIssueTime = Get-Date # Reset age timer + Write-LogHost " [TOKEN] Fresh token obtained for partition launch" -ForegroundColor Cyan + } + else { + Write-LogHost " [TOKEN] Proactive refresh failed, using current token: $($refreshResult.Message)" -ForegroundColor Yellow + } + } + elseif ($tokenAge) { + Write-LogHost " [TOKEN] Token age: $([Math]::Round($tokenAge.TotalMinutes, 1)) minutes - using current token" -ForegroundColor DarkGray + } + } + + # If we don't have a token yet, use reliable extraction helper + if (-not $accessToken) { + $accessToken = Get-GraphAccessToken + } + } + catch { + Write-LogHost " WARNING: Could not retrieve access token, parallel execution may fail" -ForegroundColor Yellow + } + + if ($canParallel) { + $jobs = @() + $jobMeta = @{} + + # Progress tracking uses actual partition count + $script:progressState.Query.Total = $partitions.Count + $script:progressState.Query.Current = 0 + + # Conditional message based on whether all partitions launch initially or some are queued + if ($maxConcurrentPartitions -ge $partitions.Count) { + Write-LogHost " Launching all $($partitions.Count) queries in parallel (initially may take several minutes)..." -ForegroundColor DarkCyan + } else { + Write-LogHost " Launching initial $maxConcurrentPartitions/$($partitions.Count) queries, remaining queued as slots free (initially may take several minutes)..." 
-ForegroundColor DarkCyan + } + + # Launch jobs asynchronously - we'll start monitoring after first batch is queued + $launchStartTime = Get-Date + $monitoringStarted = $false + $firstBatchSize = [Math]::Min($maxConcurrentPartitions, $partitions.Count) + + # Track which messages we've already shown (global deduplication) + $script:shownJobMessages = @{} + + # Track partition status for retry logic and final summary + $script:partitionStatus = @{} + + # Track page download progress per partition index (for STATUS display) + $script:partitionPageCounts = @{} + + # Track how many job output messages we've already processed per job (prevent O(n) iteration growth) + $script:jobOutputOffset = @{} + + # Track how many partitions were already complete before this run started (resume offset for STATUS display) + $script:resumeCompletedOffset = if ($script:IsResumeMode -and $script:CheckpointData.partitions.completed) { $script:CheckpointData.partitions.completed.Count } else { 0 } + foreach ($pt in $partitions) { + $script:partitionStatus[$pt.Index] = @{ + Partition = $pt + AttemptNumber = 1 + QueryId = $null + QueryName = $null + Status = 'NotStarted' # NotStarted, Sent, Complete, Failed, Subdivided, JobCreated + LastError = $null + RecordCount = 0 + } + } + + # Update checkpoint with partition count - only update total for fresh runs + if ($script:CheckpointData) { + if (-not $script:IsResumeMode) { + # Fresh run: set total from actual partition count + $script:CheckpointData.partitions.total = $partitions.Count + } + # For resume mode, keep original total from checkpoint + $script:CheckpointData.statistics.partitionsRemaining = $partitions.Count + Save-CheckpointToDisk + } + + # Track which partitions have had jobs created for them (for dynamic subdivision support) + $script:partitionsWithJobs = New-Object System.Collections.Generic.HashSet[int] + + # Define the ThreadJob scriptblock once for reuse in both initial and retry attempts + $queryJobScriptBlock = { + 
param($pStart, $pEnd, [array]$activity, $resultSize, $userIds, $idx, $tot, $sharedAuthState, $partition, $maxOutageMinutes, $apiVersion, $logPath, $existingQueryId, $incrementalDir, $runTimestamp, $memoryFlushEnabled) + # Suppress web request progress bar in job runspace + $ProgressPreference = 'SilentlyContinue' + + # Helper function to build audit API URIs with correct version + function Get-AuditUri { param($path) return "https://graph.microsoft.com/$apiVersion/security/auditLog/$path" } + + # Helper function to get current headers with fresh token from shared state + # This enables token refresh by main thread while job is running + function Get-CurrentHeaders { + param([string]$ClientRequestId) + return @{ + 'Authorization' = "Bearer $($sharedAuthState.Token)" + 'Content-Type' = 'application/json' + 'client-request-id' = $ClientRequestId + } + } + + # FIX C: Helper function to check if token is expired or near-expiry + # Returns $true if token is still valid with at least 2 minute buffer + function Test-TokenValid { + if (-not $sharedAuthState.ExpiresOn) { + # No expiry info - assume valid (will fail with 401 if not) + return $true + } + $expiresOn = $sharedAuthState.ExpiresOn + $now = (Get-Date).ToUniversalTime() + $bufferMinutes = 2 + $remainingMinutes = ($expiresOn - $now).TotalMinutes + return ($remainingMinutes -gt $bufferMinutes) + } + + # Wait-ForTokenRefresh: Called by thread jobs on 401/403 to wait for main thread token refresh + # Instead of retrying immediately with the same expired token, waits until $sharedAuthState.Token changes + # AppReg: max 120s (silent client_credentials refresh is fast) + # Interactive: max 86400s (24 hours - user may be away overnight) + function Wait-ForTokenRefresh { + $oldToken = $sharedAuthState.Token + $authMethod = $sharedAuthState.AuthMethod + $maxWaitSeconds = if ($authMethod -eq 'AppRegistration') { 120 } else { 86400 } + $checkInterval = if ($authMethod -eq 'AppRegistration') { 5 } else { 30 } + $waited = 0 + + 
Write-Host "[$(Get-Date -Format 'HH:mm:ss')] [P$idx] Auth failure detected - waiting for main thread token refresh ($authMethod mode, max ${maxWaitSeconds}s)..."

# Poll the shared auth state until its token differs from $oldToken or the wait budget is used up.
# Status lines are written with Write-Host rather than Write-Output: anything a function writes to
# the output stream becomes part of its return value, so a caller that assigns the result
# (e.g. `$tokenRefreshed = Wait-ForTokenRefresh`) would receive an array of strings plus the boolean,
# and a multi-element array always evaluates as $true - a timeout could never be detected.
# NOTE(review): the start of this function is above this span; any Write-Output there needs the same change.
while ($waited -lt $maxWaitSeconds) {
    Start-Sleep -Seconds $checkInterval
    $waited += $checkInterval

    # Check if token changed (main thread refreshed it)
    if ($sharedAuthState.Token -ne $oldToken) {
        Write-Host "[$(Get-Date -Format 'HH:mm:ss')] [P$idx] Token refreshed by main thread after ${waited}s wait. Resuming."
        return $true
    }

    # Periodic status for long waits (every 5 minutes)
    if ($waited % 300 -eq 0) {
        Write-Host "[$(Get-Date -Format 'HH:mm:ss')] [P$idx] Still waiting for token refresh... (${waited}s elapsed, max ${maxWaitSeconds}s)"
    }
}

Write-Host "[$(Get-Date -Format 'HH:mm:ss')] [P$idx] Token refresh wait timed out after ${maxWaitSeconds}s."
return $false
}

# ---- Per-partition job state ----
$allRecords = [System.Collections.Generic.List[object]]::new()
$threadSavedToDisk = $false
$threadSavedFile = $null
$jobRunId = [System.Guid]::NewGuid().ToString('N').Substring(0, 8) # Unique per job execution — ensures retry flush files don't collide with original run files
$t0 = Get-Date
$queryId = $existingQueryId # Use existing QueryId if provided (for retry after 403 fetch failure)
$debugInfo = $null

# Initialize telemetry tracking
# Every property assigned later must be declared here: assigning an undeclared property
# on a PSCustomObject raises an error instead of creating it.
$telemetry = [PSCustomObject]@{
    PartitionIndex = $idx
    PartitionTotal = $tot
    PartitionStart = $pStart.ToString('yyyy-MM-dd HH:mm:ss')
    PartitionEnd = $pEnd.ToString('yyyy-MM-dd HH:mm:ss')
    PartitionHours = [Math]::Round(($pEnd - $pStart).TotalHours, 2)
    QueryCreatedAt = $null
    InitialPollDelaySeconds = 0
    FirstRunningAt = $null
    SucceededAt = $null
    FirstPageAt = $null
    LastPageAt = $null
    PageCount = 0
    RowCount = 0
    ThrottledCount = 0
    RetryAfterTotalSeconds = 0
    ElapsedMinutes = 0
    Status = 'unknown'
    SplitRequired = $false
    PostFetch10KLimit = $false
    PreemptiveSubdivision = $false
    PreemptiveCount = 0
    SubdivisionReason = $null
    PreviewRecordCount = 0
    ThrottleRetriesDuringCreation = 0
}

try {
    # FIX C: Check token validity BEFORE starting any API work
    # If token is expired or near-expiry, return early so main thread can retry with fresh token
    if (-not (Test-TokenValid)) {
        $telemetry.Status = 'token_expired'
        Write-Output "[TOKEN-EXPIRED] Partition $idx/$tot - Token expired or near-expiry, returning for retry with fresh token"
        return [pscustomobject]@{
            Activity = $activity
            Logs = @()
            RetrievedCount = 0
            ElapsedMs = 0
            Partition = $idx
            Total = $tot
            QueryId = $existingQueryId
            DebugInfo = $null
            Telemetry = $telemetry
            TokenExpired = $true
        }
    }

    # Filters carried on the partition; in this span they are only passed to the debug dump.
    $activeRecordFilters = if ($partition.RecordTypes -and $partition.RecordTypes.Count -gt 0) { @($partition.RecordTypes) } else { $null }
    $activeServiceFilter = $partition.ServiceFilter

    # Use all activities from partition
    $queryActivities = if ($partition.Activities) { $partition.Activities } else { @($partition.Activity) }

    # Log helper to capture current query payload state in debug stream (processed by parent thread)
    # Emits, one Write-Output per line: the header, the record-type filters (if any),
    # the service filter (if any), then the JSON payload.
    #   -Header         first line of the dump
    #   -Operations     accepted for call-site symmetry; not used in the body
    #   -RecordFilters  joined with ', ' when non-empty
    #   -ServiceFilter  printed when non-empty
    #   -PayloadJson    appended as the last entry
    function Write-GraphQueryDebug {
        param(
            [string]$Header,
            [array]$Operations,
            [array]$RecordFilters,
            [string]$ServiceFilter,
            [string]$PayloadJson
        )
        $debugLines = @()
        $debugLines += "$Header"
        if ($RecordFilters -and $RecordFilters.Count -gt 0) {
            $debugLines += " recordTypeFilters: $($RecordFilters -join ', ')"
        }
        if ($ServiceFilter) {
            $debugLines += " serviceFilter: $ServiceFilter"
        }
        $debugLines += $PayloadJson
        foreach ($line in $debugLines) {
            Write-Output $line
        }
    }

    # Generate unique client-request-id for traceability (critical for Microsoft support)
    $clientRequestId = [guid]::NewGuid().ToString()

    # Get headers with current token (will be refreshed via Get-CurrentHeaders for each API call)
    $headers = Get-CurrentHeaders -ClientRequestId $clientRequestId

    # PREEMPTIVE SUBDIVISION CHECK: Disabled - Graph API queries endpoint doesn't support
    # filtering by query payload properties, so we can't preemptively count records.
    # Subdivision will happen post-fetch based on actual record counts.
    $shouldSubdividePreemptively = $false
    $preemptiveCountCheck = $null

    # Construct displayName with partition info: PAX_Query_YYYYMMDD_HHMM-YYYYMMDD_HHMM_PartX/Y
    # Use last included minute (pEnd - 1 minute) since end date is exclusive
    $pEndDisplay = $pEnd.AddMinutes(-1)
    $displayName = "PAX_Query_$($pStart.ToString('yyyyMMdd_HHmm'))-$($pEndDisplay.ToString('yyyyMMdd_HHmm'))_Part$idx/$tot"

    # If preemptive subdivision is needed, skip query creation and return subdivision signal
    # (currently unreachable: the flag is hard-coded to $false above)
    if ($shouldSubdividePreemptively) {
        $telemetry.Status = 'subdivided_preemptively'
        # Use the declared SubdivisionReason property; $telemetry has no SplitReason property,
        # so assigning to it would throw as soon as this branch is enabled.
        $telemetry.SubdivisionReason = "Count check returned $preemptiveCountCheck records (>= 9500 threshold)"
        return [pscustomobject]@{
            Activity = $activity
            Logs = @()
            RetrievedCount = 0
            ElapsedMs = 0
            Partition = $idx
            Total = $tot
            QueryId = $null
            DebugInfo = $null
            Telemetry = $telemetry
        }
    }

    # SKIP CREATE if we already have a QueryId (retry after 403 fetch failure)
    $skipCreate = $false
    if ($queryId) {
        $skipCreate = $true
        Write-Output "[RETRY-FETCH] Partition $idx/$tot - Reusing existing QueryId: $queryId (skipping CREATE)"
        $telemetry.QueryCreatedAt = 'reused_existing'
    }

    # Convert outage tolerance to seconds BEFORE the skipCreate check so FETCH phase always has it
    $maxNetworkOutageSeconds = $maxOutageMinutes * 60

    # Retry loop for query creation with 429 handling (unlimited retries for throttling)
    # Also handles transient network errors (502, 503, connection failures) with time-based tolerance
    if (-not $skipCreate) {
        try { # CREATE phase try-catch wrapper - handles CREATE-specific failures separately from FETCH errors
            $createRetries = 0
            $createSuccess = $false
            $networkErrorStart = $null

            # Build query body ONCE before retry loop
            $queryBody = @{
                displayName = $displayName
filterStartDateTime = $pStart.ToUniversalTime().ToString('yyyy-MM-ddTHH:mm:ss.fffZ')
filterEndDateTime = $pEnd.ToUniversalTime().ToString('yyyy-MM-ddTHH:mm:ss.fffZ')
operationFilters = @($queryActivities)
}
# NOTE(review): $activeRecordFilters / $activeServiceFilter are passed to the debug dump below
# but are not keys of $queryBody - confirm that is intentional.

# Serialized once; the same JSON is re-sent on every retry of the loop below.
$queryBodyJson = $queryBody | ConvertTo-Json -Depth 5
Write-GraphQueryDebug -Header "Graph API Query Body for partition $idx/$tot (Operations: $($queryActivities -join ', ')):" -Operations $queryActivities -RecordFilters $activeRecordFilters -ServiceFilter $activeServiceFilter -PayloadJson $queryBodyJson

$attemptTimestamp = Get-Date -Format 'HH:mm:ss'
$attemptMessage = "[ATTEMPT] [$attemptTimestamp] Partition $idx/$tot - Starting query creation..."
Write-Host $attemptMessage -ForegroundColor DarkGray

# Reads a single header value from an error response, or returns $null.
# Tries the string indexer first, then falls back to Headers.GetValues() for header
# collections that do not expose an indexer. The response object is passed in explicitly:
# inside a nested catch block `$_` is the *new* error, so `$_.Exception.Response` is not
# the HTTP response there and a fallback written that way can never succeed.
$readResponseHeader = {
    param($Response, [string]$Name)
    if (-not $Response -or -not $Response.Headers) { return $null }
    $headerValue = $null
    try { $headerValue = $Response.Headers[$Name] } catch { $headerValue = $null }
    if (-not $headerValue) {
        try { $headerValue = $Response.Headers.GetValues($Name) | Select-Object -First 1 } catch { $headerValue = $null }
    }
    return $headerValue
}

# CREATE retry loop: POST the query until it is accepted, or until one of the handlers below throws.
#   403     -> permanent: throw immediately; transient: up to 3 backoff retries, then throw
#   429     -> honour Retry-After when usable, else exponential backoff; throw at 20 attempts
#   network -> retry while inside the $maxNetworkOutageSeconds window
# NOTE: the 403 and 429 branches share the single $createRetries counter.
while (-not $createSuccess) {
    try {
        $queryUri = "https://graph.microsoft.com/$apiVersion/security/auditLog/queries"
        $createResponse = Invoke-RestMethod -Method POST -Uri $queryUri -Headers (Get-CurrentHeaders -ClientRequestId $clientRequestId) -Body $queryBodyJson -ErrorAction Stop
        $queryId = $createResponse.id
        $telemetry.QueryCreatedAt = (Get-Date).ToString('yyyy-MM-dd HH:mm:ss')

        # LOG: Query successfully sent to server (output stream only, displayed by main thread)
        $sentTimestamp = Get-Date -Format 'HH:mm:ss'
        $sentMessage = "[SENT] [$sentTimestamp] Partition $idx/$tot - Query sent to Purview (QueryId: $queryId)"
        Write-Output $sentMessage

        $createSuccess = $true
        $networkErrorStart = $null # Reset network error timer on success
    }
    catch {
        # Keep a handle on the failed HTTP response; nested try/catch blocks below rebind `$_`.
        $failedResponse = $_.Exception.Response

        # CRITICAL: Check for transient network errors FIRST (502, 503, 504) - must be prioritized
        # before 429 check to ensure these errors are always caught and retried correctly
        $isNetworkError = $false
        $errorMessage = $_.Exception.Message
        $errorSummary = "Unknown error"

        if ($failedResponse) {
            $statusCode = $failedResponse.StatusCode.value__
            if ($statusCode -ge 500) {
                $isNetworkError = $true
                $errorSummary = "$statusCode Server Error"
            }
        }
        if (-not $isNetworkError -and ($errorMessage -match '5\d{2}|Bad Gateway|Service Unavailable|Gateway Timeout|Internal Server Error')) {
            $isNetworkError = $true
            $errorSummary = "Server error (from message)"
        }
        if (-not $isNetworkError -and ($errorMessage -match 'timed out|connection|unable to connect|could not be resolved')) {
            $isNetworkError = $true
            $errorSummary = "Network connectivity issue"
        }

        # Check for 429 throttling (AFTER network error check)
        $is429Create = $false
        if (-not $isNetworkError) {
            if ($failedResponse) {
                $statusCode = $failedResponse.StatusCode
                if ($statusCode -eq 429 -or $statusCode -eq 'TooManyRequests' -or $statusCode.value__ -eq 429) {
                    $is429Create = $true
                }
            }
            if (-not $is429Create -and ($errorMessage -match '429' -or $errorMessage -match 'Too Many Requests')) {
                $is429Create = $true
            }
        }

        # Check for 403 Forbidden (Microsoft service-side issue)
        $is403Create = $false
        if (-not $isNetworkError -and -not $is429Create) {
            if ($failedResponse) {
                $statusCode = $failedResponse.StatusCode
                if ($statusCode -eq 403 -or $statusCode -eq 'Forbidden' -or $statusCode.value__ -eq 403) {
                    $is403Create = $true
                }
            }
            if (-not $is403Create -and ($errorMessage -match '403' -or $errorMessage -match 'Forbidden')) {
                $is403Create = $true
            }
        }

        # Handle different error types
        if ($is403Create) {
            # Extract diagnostic info from 403 response (critical for Microsoft support)
            $responseBody403 = $null
            $requestId403 = $null
            $wwwAuth403 = $null
            $isPermanent403 = $false

            # PowerShell 7 error handling: ErrorDetails.Message contains response body
            try {
                if ($_.ErrorDetails -and $_.ErrorDetails.Message) {
                    $responseBody403 = $_.ErrorDetails.Message
                }
            } catch {}

            # Headers: indexer first, GetValues() fallback (both against the captured response)
            $requestId403 = & $readResponseHeader $failedResponse 'request-id'
            $wwwAuth403 = & $readResponseHeader $failedResponse 'WWW-Authenticate'

            # Fallback: try to read response stream if ErrorDetails was empty
            if ($failedResponse -and -not $responseBody403) {
                try {
                    $respStream = $failedResponse.GetResponseStream()
                    if ($respStream -and $respStream.CanRead) {
                        $reader = New-Object System.IO.StreamReader($respStream)
                        $responseBody403 = $reader.ReadToEnd()
                        $reader.Dispose()
                    }
                } catch {}
            }

            # Also capture the full exception message as fallback
            $exceptionMessage403 = $_.Exception.Message

            # Check if this is a PERMANENT 403 (don't retry these)
            if ($responseBody403 -match 'InsufficientPrivileges|Authorization_RequestDenied|AccessDenied|InvalidAuthenticationToken') {
                $isPermanent403 = $true
            }
            # Claims challenge indicates CAE - token needs refresh, not retry
            if ($wwwAuth403 -match 'claims') {
                $isPermanent403 = $true # Don't retry with same token, need fresh token
            }

            # Log diagnostic info for Microsoft support
            try {
                $diagLog = "[$(Get-Date -Format 'yyyy-MM-dd HH:mm:ss')] [403-DIAG] Partition $idx/$tot`n"
                $diagLog += " client-request-id: $clientRequestId`n"
                $diagLog += " request-id: $requestId403`n"
                $diagLog += " exception: $exceptionMessage403`n"
                if ($wwwAuth403) { $diagLog += " WWW-Authenticate: $wwwAuth403`n" }
                if ($responseBody403) { $diagLog += " Response body: $responseBody403`n" }
                $diagLog += " Permanent error: $isPermanent403"
                $diagLog | Add-Content -Path $logPath -Encoding UTF8 -ErrorAction SilentlyContinue
            } catch {}

            if ($isPermanent403) {
                # Permanent 403 - don't retry, fail immediately for partition retry with fresh token
                Write-Output "[403-PERM] Partition $idx/$tot - PERMANENT 403 (permissions/CAE) - Failing partition | request-id: $requestId403 | client-request-id: $clientRequestId"
                throw "403 Forbidden (permanent) - $responseBody403"
            }

            # Transient 403 - retry with exponential backoff
            $createRetries++
            $max403Retries = 3 # Limited retries since we can't refresh token inside ThreadJob

            if ($createRetries -le $max403Retries) {
                # Exponential backoff: 15s, 30s, 60s
                $retryAfter = [Math]::Min(15 * [Math]::Pow(2, $createRetries - 1), 60)

                # Thread-safe file logging
                try {
                    $logMsg = "[$(Get-Date -Format 'yyyy-MM-dd HH:mm:ss')] [403-CREATE] Partition $idx/$tot - Transient 403 (Attempt $createRetries/$max403Retries) - Retrying in ${retryAfter}s"
                    $logMsg | Add-Content -Path $logPath -Encoding UTF8 -ErrorAction SilentlyContinue
                } catch {}

                Write-Output "[403-CREATE] Partition $idx/$tot - Transient 403 (Attempt $createRetries/$max403Retries) - Retrying in ${retryAfter}s | request-id: $requestId403 | client-request-id: $clientRequestId"
                Start-Sleep -Seconds $retryAfter
            } else {
                # Max 403 retries exceeded - throw to trigger partition-level retry (which will refresh token)
                try {
                    $logMsg = "[$(Get-Date -Format 'yyyy-MM-dd HH:mm:ss')] [403-CREATE] Partition $idx/$tot - Max transient 403 retries exceeded ($max403Retries), failing partition for retry with fresh token"
                    $logMsg | Add-Content -Path $logPath -Encoding UTF8 -ErrorAction SilentlyContinue
                } catch {}

                Write-Output "[403-MAX] Partition $idx/$tot - Max transient 403 retries exceeded - Failing partition | request-id: $requestId403 | client-request-id: $clientRequestId"
                throw "403 Forbidden (transient) - max retries exceeded (partition will retry with fresh token)"
            }
        }
        elseif ($is429Create) {
            $createRetries++

            # Safety cap: prevent truly infinite throttle loops.
            # Checked before sleeping so the final attempt does not wait out a delay only to abort.
            if ($createRetries -ge 20) {
                throw "Query creation throttled after $createRetries attempts - aborting partition"
            }

            # Throttling - use the server's Retry-After when it is a positive whole number of seconds.
            # The header is read defensively and parsed with TryParse: an unreadable header or a
            # non-numeric value (Retry-After may also be an HTTP date) must not raise inside this
            # catch block, which would surface as a CREATE failure instead of a throttle wait.
            $retryAfterRaw = & $readResponseHeader $failedResponse 'Retry-After'
            $retryAfterSeconds = 0
            if ($retryAfterRaw -and [int]::TryParse([string]$retryAfterRaw, [ref]$retryAfterSeconds) -and $retryAfterSeconds -gt 0) {
                $retryAfter = $retryAfterSeconds
            }
            else {
                # Exponential backoff: 60s, 120s, 240s, then cap at 300s (5 min)
                $retryAfter = [Math]::Min(60 * [Math]::Pow(2, $createRetries - 1), 300)
            }

            # Send real-time throttle notification to parent process
            try {
                $throttleNotification = [PSCustomObject]@{
                    Partition = "$idx/$tot"
                    Attempt = $createRetries
                    RetryAfter = $retryAfter
                    Timestamp = (Get-Date)
                }
                $script:throttleNotifications.Enqueue($throttleNotification)
            }
            catch {
                # Fallback to output if synchronized collection fails
                Write-Output "[THROTTLE] Partition $idx/$tot - Query creation throttled (Attempt $createRetries) - Waiting $retryAfter seconds..."
            }

            Start-Sleep -Seconds $retryAfter
        }
        elseif ($isNetworkError) {
            # Network error - check if we're still within the outage tolerance window
            if (-not $networkErrorStart) {
                $networkErrorStart = Get-Date
                # First failure of this outage: announce the retry window and the raw error once
                Write-Output "[NETWORK] Partition $idx/$tot - $errorSummary - Starting retry window (max ${maxOutageMinutes}m)"
                Write-Output "[NETWORK] First network error for partition $idx/$tot : $errorMessage"
            }

            $elapsedOutageSeconds = ((Get-Date) - $networkErrorStart).TotalSeconds

            if ($elapsedOutageSeconds -lt $maxNetworkOutageSeconds) {
                $remainingMinutes = [Math]::Ceiling(($maxNetworkOutageSeconds - $elapsedOutageSeconds) / 60)
                $retryDelay = 30 + (Get-Random -Minimum 10 -Maximum 30) # 40-59s random delay (-Maximum is exclusive)

                # Per-retry progress line with elapsed outage time and the raw error
                $elapsedFormatted = [Math]::Round($elapsedOutageSeconds, 1)
                Write-Output "[NETWORK] Retry attempt for partition $idx/$tot (${elapsedFormatted}s elapsed) : $errorMessage"

                Start-Sleep -Seconds $retryDelay
            }
            else {
                # Network outage exceeded tolerance
                $outageMinutes = [Math]::Round($elapsedOutageSeconds / 60, 1)
Write-Output "[CREATE-FAILED] Partition $idx/$tot - Network outage exceeded $maxOutageMinutes minute tolerance (${outageMinutes}m elapsed) - will retry at end of run" + throw "Network outage exceeded $maxOutageMinutes minute tolerance during query creation" + } + } + else { + # Non-retriable error - log it and re-throw + $statusCode = $null + $bodyText = $null + try { + if ($_.Exception.Response) { + $statusCode = [int]$_.Exception.Response.StatusCode.value__ + } + } catch {} + if (-not $statusCode -and $_.Exception.Response) { + try { $statusCode = [int]$_.Exception.Response.StatusCode } catch {} + } + try { + if ($_.Exception.Response) { + $respStream = $_.Exception.Response.GetResponseStream() + if ($respStream) { + $reader = New-Object System.IO.StreamReader($respStream) + $bodyText = $reader.ReadToEnd() + $reader.Dispose() + } + } + } catch {} + + # No automatic filter fallback allowed – capture diagnostics only + + $errorDetails = "StatusCode: $(if ([string]::IsNullOrEmpty($statusCode)) { $_.Exception.Response.StatusCode } else { $statusCode }), Message: $($_.Exception.Message)" + Write-Host "[CREATE-FAILED] Partition $idx/$tot - Query creation failed: $errorDetails" -ForegroundColor Red + Write-Output "[ERROR] Partition $idx/$tot - Query creation failed (will retry at end of run): $errorDetails" + if ($bodyText) { + Write-Output "[GRAPH-ERROR] Partition $idx/$tot - Response body: $bodyText" + } + throw # Re-throw non-network, non-throttle errors + } + } + } + + # Track throttle retries for summary + $telemetry.ThrottleRetriesDuringCreation = $createRetries + + if (-not $queryId) { + throw "Failed to create query" + } + + # Log query details + $debugInfo = "Query ID: $queryId | DateRange: $($pStart.ToString('yyyy-MM-dd HH:mm')) to $($pEnd.ToString('yyyy-MM-dd HH:mm')) UTC | Activities: $($queryActivities -join ', ')" + } # End of CREATE phase try block + catch { + # CREATE phase failed - re-throw with CREATE-FAILED marker so outer catch knows this is not a 
FETCH error + $createError = $_.Exception.Message + $createStack = $_.ScriptStackTrace + + # Log CREATE failure + try { + $logMsg = "[$(Get-Date -Format 'yyyy-MM-dd HH:mm:ss')] [CREATE-FAILED] Partition $idx/$tot - Query creation failed: $createError" + $logMsg | Add-Content -Path $logPath -Encoding UTF8 -ErrorAction SilentlyContinue + } catch {} + + # Re-throw with marker for outer catch to identify as CREATE failure + throw "[CREATE-FAILED] $createError" + } + } # End of if (-not $skipCreate) block + + # Step 2: Poll for completion (no artificial timeout — polls until Purview responds) + # Microsoft guidance: 30-60s intervals for enterprise-scale parallel execution + # Network outage tolerance (adaptive) using passed-in MaxNetworkOutageMinutes parameter + $effectiveOutage = if ($maxOutageMinutes -and $maxOutageMinutes -gt 0) { $maxOutageMinutes } else { 30 } + $netOutageStart = $null + $netErrorStreak = 0 + $netPatterns = @('timed out','connection','unable to connect','remote name could not be resolved','temporarily unavailable','network','500','502','503','504','bad gateway','gateway timeout','service unavailable','internal server error') + $lastNetHeartbeat = Get-Date + $lastNetMessage = $null # Throttle repetitive network messages + $netMessageMinInterval = 60 # Minimum seconds between network status messages + $pollCount = 0 + + $pollStartTime = Get-Date + $queryComplete = $false + + # Initial wait before first poll with randomization to prevent synchronization + $initialWaitSeconds = Get-Random -Minimum 30 -Maximum 60 + $telemetry.InitialPollDelaySeconds = $initialWaitSeconds + Start-Sleep -Seconds $initialWaitSeconds + + while (-not $queryComplete) { + $pollCount++ + + # Poll query status with 429 throttling detection + try { + # Pre-check: if token is already expired, wait for refresh before making a doomed API call + if (-not (Test-TokenValid)) { + Write-Output "[$(Get-Date -Format 'HH:mm:ss')] [P$idx] Token expired before POLL - waiting for main thread 
refresh..." + $null = Wait-ForTokenRefresh + } + $statusResponse = Invoke-RestMethod -Method GET -Uri (Get-AuditUri -path "queries/$queryId") ` + -Headers (Get-CurrentHeaders -ClientRequestId $clientRequestId) -ErrorAction Stop + # Reset outage tracking on success + if ($netOutageStart) { + $duration = (Get-Date) - $netOutageStart + # Only log recovery if outage lasted > 1 minute (ignore brief connection blips) + if ($duration.TotalMinutes -ge 1) { + Write-Host "[NET] Connectivity restored after $([Math]::Round($duration.TotalMinutes,1)) minutes - Partition $idx/$tot" -ForegroundColor Green + } + $netOutageStart = $null; $netErrorStreak = 0 + $lastNetHeartbeat = Get-Date # Reset heartbeat timer on recovery + # NOTE: Do NOT reset $lastNetMessage here - keep throttle window active + # to prevent message flooding during intermittent connectivity + } + } + catch { + # Check for 429 throttling using safe detection + $is429 = $false + if ($_.Exception.Response) { + $statusCode = $_.Exception.Response.StatusCode + if ($statusCode -eq 429 -or $statusCode -eq 'TooManyRequests' -or $statusCode.value__ -eq 429) { + $is429 = $true + } + } + if (-not $is429 -and ($_.Exception.Message -match '429' -or $_.Exception.Message -match 'Too Many Requests')) { + $is429 = $true + } + + if ($is429) { + # Track throttling in telemetry + $telemetry.ThrottledCount++ + # Respect Retry-After header if present, otherwise use 60s default + $retryAfter = 60 + if ($_.Exception.Response.Headers -and $_.Exception.Response.Headers['Retry-After']) { + $retryAfter = [int]$_.Exception.Response.Headers['Retry-After'] + } + $telemetry.RetryAfterTotalSeconds += $retryAfter + + # Log throttling event to user + Write-Host "[!] 
API Rate Limit (429) - Partition $idx/$tot - Retry in $retryAfter seconds (Throttle #$($telemetry.ThrottledCount))" -ForegroundColor Yellow + + Start-Sleep -Seconds $retryAfter + continue # Retry this poll + } + else { + # Check for 403 Forbidden (Microsoft service-side issue) + $is403Poll = $false + $pollErrMsg = $_.Exception.Message + if ($_.Exception.Response) { + $statusCode = $_.Exception.Response.StatusCode + if ($statusCode -eq 403 -or $statusCode -eq 'Forbidden' -or $statusCode.value__ -eq 403) { + $is403Poll = $true + } + } + if (-not $is403Poll -and ($pollErrMsg -match '403' -or $pollErrMsg -match 'Forbidden')) { + $is403Poll = $true + } + + if ($is403Poll) { + # Extract diagnostic info from 403 response + $responseBody403Poll = $null + $requestId403Poll = $null + $wwwAuth403Poll = $null + $isPermanent403Poll = $false + + try { + if ($_.Exception.Response) { + $requestId403Poll = $_.Exception.Response.Headers['request-id'] + $wwwAuth403Poll = $_.Exception.Response.Headers['WWW-Authenticate'] + $respStream = $_.Exception.Response.GetResponseStream() + if ($respStream) { + $reader = New-Object System.IO.StreamReader($respStream) + $responseBody403Poll = $reader.ReadToEnd() + $reader.Dispose() + } + } + } catch {} + + # Check if permanent 403 + if ($responseBody403Poll -match 'InsufficientPrivileges|Authorization_RequestDenied|AccessDenied|InvalidAuthenticationToken') { + $isPermanent403Poll = $true + } + if ($wwwAuth403Poll -match 'claims') { + $isPermanent403Poll = $true + } + + # Log diagnostic info + try { + $diagLog = "[$(Get-Date -Format 'yyyy-MM-dd HH:mm:ss')] [403-POLL-DIAG] Partition $idx/$tot`n" + $diagLog += " client-request-id: $clientRequestId`n" + $diagLog += " request-id: $requestId403Poll`n" + if ($wwwAuth403Poll) { $diagLog += " WWW-Authenticate: $wwwAuth403Poll`n" } + if ($responseBody403Poll) { $diagLog += " Response body: $responseBody403Poll`n" } + $diagLog += " Permanent error: $isPermanent403Poll" + $diagLog | Add-Content -Path 
$logPath -Encoding UTF8 -ErrorAction SilentlyContinue + } catch {} + + if ($isPermanent403Poll) { + Write-Output "[403-PERM] Partition $idx/$tot - PERMANENT 403 on POLL - Failing partition | request-id: $requestId403Poll | client-request-id: $clientRequestId" + } + + # 403 during status polling - wait for main thread to refresh token first + $tokenRefreshed = Wait-ForTokenRefresh + if ($tokenRefreshed) { + # Token was refreshed - reset counter and retry immediately with fresh token + $script:poll403Count = 0 + Write-Output "[403-POLL] Partition $idx/$tot - Token refreshed, resetting retry counter and resuming poll" + continue # Retry this poll with new token + } + + # Token refresh failed/timed out - fall through to limited retry logic + if (-not $script:poll403Count) { $script:poll403Count = 0 } + $script:poll403Count++ + $max403Polls = 3 + + if ($script:poll403Count -le $max403Polls) { + $retryAfter = [Math]::Min(15 * [Math]::Pow(2, $script:poll403Count - 1), 60) + + # Thread-safe file logging + try { + $logMsg = "[$(Get-Date -Format 'yyyy-MM-dd HH:mm:ss')] [403-POLL] Partition $idx/$tot - Transient 403 (Attempt $($script:poll403Count)/$max403Polls) - Retrying in ${retryAfter}s" + $logMsg | Add-Content -Path $logPath -Encoding UTF8 -ErrorAction SilentlyContinue + } catch {} + + Write-Output "[403-POLL] Partition $idx/$tot - Transient 403 (Attempt $($script:poll403Count)/$max403Polls) - Retrying in ${retryAfter}s | request-id: $requestId403Poll | client-request-id: $clientRequestId" + Start-Sleep -Seconds $retryAfter + continue # Retry this poll + } else { + # Max 403 retries exceeded - fail partition for retry with fresh token + try { + $logMsg = "[$(Get-Date -Format 'yyyy-MM-dd HH:mm:ss')] [403-POLL] Partition $idx/$tot - Max transient 403 poll retries exceeded ($max403Polls), failing partition" + $logMsg | Add-Content -Path $logPath -Encoding UTF8 -ErrorAction SilentlyContinue + } catch {} + +Write-Output "[403-MAX] Partition $idx/$tot - Max transient 403 poll 
retries exceeded - Failing partition | request-id: $requestId403Poll | client-request-id: $clientRequestId" + throw "403 Forbidden during status poll - max retries exceeded" + } + } + + # Detect transient network outage vs hard failure + $errMsg = $_.Exception.Message + if ($netPatterns | Where-Object { $errMsg.ToLower().Contains($_) }) { + if (-not $netOutageStart) { $netOutageStart = Get-Date } + $netErrorStreak++ + $elapsedOutage = (Get-Date) - $netOutageStart + + # Throttle messages: only show if outage > 1 min OR no recent message + $shouldShowMessage = $false + if ($elapsedOutage.TotalMinutes -ge 1) { + # Sustained outage - show message but throttle to once per minute + if (-not $lastNetMessage -or ((Get-Date) - $lastNetMessage).TotalSeconds -ge $netMessageMinInterval) { + $shouldShowMessage = $true + } + } elseif ($netErrorStreak -eq 1 -and (-not $lastNetMessage -or ((Get-Date) - $lastNetMessage).TotalSeconds -ge $netMessageMinInterval)) { + # First error in a new outage window - show initial message + $shouldShowMessage = $true + } + + if ($shouldShowMessage) { + Write-Output "[NETWORK] Transient network issue (streak $netErrorStreak, outage $([Math]::Round($elapsedOutage.TotalMinutes,1))m) - Partition $idx/$tot" + $lastNetMessage = Get-Date + } + + # Adaptive backoff during outage + $waitSec = [Math]::Min(95, [Math]::Round(35 * [Math]::Pow(1.3, $netErrorStreak))) + Start-Sleep -Seconds $waitSec + + # Heartbeat every ~5 minutes of sustained outage (only if outage > 2 min) + if ($elapsedOutage.TotalMinutes -ge 2 -and ((Get-Date) - $lastNetHeartbeat).TotalMinutes -ge 5) { + Write-Output "[NETWORK] Still waiting on network recovery (outage $([Math]::Round($elapsedOutage.TotalMinutes,1))m, tolerance $effectiveOutage m)" + $lastNetHeartbeat = Get-Date + } + + if ($elapsedOutage.TotalMinutes -ge $effectiveOutage) { + throw "Network outage exceeded tolerance ($effectiveOutage minutes)" + } + continue + } + elseif ($statusCode -eq 401 -or $errorMessage -match 
'401|Unauthorized') { + # 401 Unauthorized during poll - token expired, wait for main thread refresh + Write-Output "[ERROR] Partition $idx/$tot - 401 Unauthorized during poll - waiting for token refresh" + $refreshed = Wait-ForTokenRefresh -TimeoutSeconds 120 -CheckIntervalSeconds 5 + if ($refreshed) { + Write-Output "[STATUS] Partition $idx/$tot - Token refreshed, retrying poll" + $currentToken = $SharedAuthState.AccessToken + continue # Retry the poll with fresh token + } else { + throw "401 Unauthorized during poll and token refresh timed out" + } + } + else { + throw # Non-transient, abort partition + } + } + } + + # Process status response + switch ($statusResponse.status) { + 'succeeded' { + $telemetry.Status = 'succeeded' + $telemetry.SucceededAt = (Get-Date).ToString('yyyy-MM-dd HH:mm:ss') + + # RECORD COUNT PREVIEW: Get exact count before fetching data (enables preemptive subdivision) + try { + $countUri = "https://graph.microsoft.com/$apiVersion/security/auditLog/queries?`$count=true&`$filter=queryId eq '$queryId'" + $countResponse = Invoke-RestMethod -Method GET -Uri $countUri -Headers (Get-CurrentHeaders -ClientRequestId $clientRequestId) -TimeoutSec 10 -ErrorAction Stop + $previewCount = $countResponse.'@odata.count' + Write-Output "[COUNT] Query $queryId succeeded - Actual record count: $previewCount" + $telemetry.PreviewRecordCount = $previewCount + if ($previewCount -ge 9500) { + $partitionHours = ($pEnd - $pStart).TotalHours + $minSubdivisionDays = 0.001389 # 2 minutes + $minSubdivisionHours = $minSubdivisionDays * 24 + + if ($partitionHours -gt $minSubdivisionHours) { + # Partition can be subdivided - flag for subdivision and skip record retrieval + $telemetry.PreemptiveSubdivision = $true + $telemetry.SubdivisionReason = "preview_count_$previewCount" + + Write-Host "[SUBDIVISION] Partition $idx/$tot - Preview count $previewCount >= 9500 - Subdividing partition ($([Math]::Round($partitionHours,2))h window)" -ForegroundColor Yellow + + # Return 
subdivision signal to parent + return [PSCustomObject]@{ + QueryId = $queryId + Status = 'needs_subdivision' + PreviewCount = $previewCount + PartitionStart = $pStart + PartitionEnd = $pEnd + PartitionIndex = $idx + PartitionTotal = $tot + RetrievedCount = 0 + Telemetry = $telemetry + } + } else { + Write-Host "[COUNT] Partition $idx/$tot - Preview count $previewCount (at minimum subdivision window $([Math]::Round($partitionHours,2))h, proceeding with fetch)" -ForegroundColor Cyan + } + } else { + Write-Host "[COUNT] Partition $idx/$tot - Preview count $previewCount (below threshold, proceeding with fetch)" -ForegroundColor DarkCyan + } + } catch { + # Count preview failed - proceed with normal fetch (non-critical failure) + Write-Output "[COUNT-WARN] Partition $idx/$tot - Count preview failed, proceeding with fetch: $($_.Exception.Message)" + } + + # Note: Don't write "Query succeeded" yet - wait until records are actually retrieved and validated + $queryComplete = $true + break + } + 'failed' { + $telemetry.Status = 'failed' + Write-Host "✗ Query failed - Partition $idx/$tot - Query ID: $queryId" -ForegroundColor Red + # Delete failed query from Purview to free server slot + if ($queryId) { + try { + Invoke-RestMethod -Method DELETE -Uri (Get-AuditUri -path "queries/$queryId") ` + -Headers (Get-CurrentHeaders -ClientRequestId $clientRequestId) -ErrorAction SilentlyContinue | Out-Null + } catch {} + } + throw "Query failed" + } + 'cancelled' { + $telemetry.Status = 'cancelled' + Write-Host "✗ Query cancelled - Partition $idx/$tot - Query ID: $queryId" -ForegroundColor Red + # Delete cancelled query from Purview to free server slot + if ($queryId) { + try { + Invoke-RestMethod -Method DELETE -Uri (Get-AuditUri -path "queries/$queryId") ` + -Headers (Get-CurrentHeaders -ClientRequestId $clientRequestId) -ErrorAction SilentlyContinue | Out-Null + } catch {} + } + throw "Query cancelled" + } + 'queued' { + # Query is waiting in backend queue for available execution 
slot + $telemetry.Status = 'queued' + Write-Host "⏳ Query queued (waiting for backend capacity) - Partition $idx/$tot - Retry in 60-90s" -ForegroundColor Cyan + # Use longer polling interval for queued state (60-90s) since backend capacity is limited + $waitSeconds = Get-Random -Minimum 60 -Maximum 90 + Start-Sleep -Seconds $waitSeconds + } + 'notStarted' { + # Query accepted but not yet started processing + $telemetry.Status = 'notStarted' + # Randomized polling interval (35-60s) + $waitSeconds = Get-Random -Minimum 35 -Maximum 60 + Start-Sleep -Seconds $waitSeconds + } + 'running' { + # Track first time we see running status + if (-not $telemetry.FirstRunningAt) { + $telemetry.FirstRunningAt = (Get-Date).ToString('yyyy-MM-dd HH:mm:ss') + Write-Output "[STATUS] Query running - Partition $idx/$tot - Started processing" + } + $telemetry.Status = 'running' + # Randomized polling interval (35-60s) to prevent synchronization + $waitSeconds = Get-Random -Minimum 35 -Maximum 60 + Start-Sleep -Seconds $waitSeconds + } + default { + $telemetry.Status = $statusResponse.status + Write-Host "? 
Query status: $($statusResponse.status) - Partition $idx/$tot - Retry in 35-60s" -ForegroundColor Magenta + # Randomized polling interval (35-60s) to prevent synchronization + $waitSeconds = Get-Random -Minimum 35 -Maximum 60 + Start-Sleep -Seconds $waitSeconds + } + } + } + + # Step 3: Retrieve records with pagination + $recordsUri = Get-AuditUri -path "queries/$queryId/records" + $fetchNetworkErrorStart = $null + $unexpectedProcessingError = $false + $unexpectedProcessingMessage = $null + $fetchErrorRetryCount = 0 + $maxFetchErrorRetries = 3 # Retry unexpected errors 3 times before giving up + + # CRITICAL: When resultSize=0, fetch unlimited records (don't check count) + # When resultSize>0, stop when we reach the limit (EOM mode behavior) + $pageFlushTotalCount = 0 + $pageFlushFilePath = $null + while ($recordsUri -and ($resultSize -eq 0 -or ($pageFlushTotalCount + $allRecords.Count) -lt $resultSize)) { + # Retry loop for record fetching with 429 and network error handling + $fetchRetries = 0 + $maxFetchRetries = 5 + $fetchSuccess = $false + + while ($fetchRetries -lt $maxFetchRetries -and -not $fetchSuccess) { + try { + $recordsResponse = Invoke-RestMethod -Method GET -Uri $recordsUri -Headers (Get-CurrentHeaders -ClientRequestId $clientRequestId) -ErrorAction Stop + $fetchSuccess = $true + # Reset network error tracking on success + $fetchNetworkErrorStart = $null + } + catch { + # Check for 429 throttling + $is429Fetch = $false + if ($_.Exception.Response) { + $statusCode = $_.Exception.Response.StatusCode + if ($statusCode -eq 429 -or $statusCode -eq 'TooManyRequests' -or $statusCode.value__ -eq 429) { + $is429Fetch = $true + } + } + if (-not $is429Fetch -and ($_.Exception.Message -match '429' -or $_.Exception.Message -match 'Too Many Requests')) { + $is429Fetch = $true + } + + # Check for network errors (502, 503, 504, connection issues) + $isNetworkFetch = $false + $fetchErrorMessage = $_.Exception.Message + $fetchErrorSummary = "Unknown error" + + if 
($_.Exception.Response) { + $statusCode = $_.Exception.Response.StatusCode.value__ + if ($statusCode -ge 500) { + $isNetworkFetch = $true + $fetchErrorSummary = "$statusCode Server Error" + } + } + if (-not $isNetworkFetch -and ($fetchErrorMessage -match '5\d{2}|Bad Gateway|Service Unavailable|Gateway Timeout|Internal Server Error')) { + $isNetworkFetch = $true + $fetchErrorSummary = "Server error (from message)" + } + if (-not $isNetworkFetch -and ($fetchErrorMessage -match 'timed out|connection|unable to connect|could not be resolved')) { + $isNetworkFetch = $true + $fetchErrorSummary = "Network connectivity issue" + } + + # Check for 403 Forbidden (Microsoft service-side issue) + $is403Fetch = $false + if (-not $is429Fetch -and -not $isNetworkFetch) { + if ($_.Exception.Response) { + $statusCode = $_.Exception.Response.StatusCode + if ($statusCode -eq 403 -or $statusCode -eq 'Forbidden' -or $statusCode.value__ -eq 403) { + $is403Fetch = $true + } + } + if (-not $is403Fetch -and ($fetchErrorMessage -match '403' -or $fetchErrorMessage -match 'Forbidden')) { + $is403Fetch = $true + } + } + + if ($is403Fetch) { + # Extract diagnostic info from 403 response + $responseBody403Fetch = $null + $requestId403Fetch = $null + $wwwAuth403Fetch = $null + $isPermanent403Fetch = $false + $exceptionMessage403Fetch = $_.Exception.Message + + # PowerShell 7 error handling: ErrorDetails.Message contains response body + try { + if ($_.ErrorDetails -and $_.ErrorDetails.Message) { + $responseBody403Fetch = $_.ErrorDetails.Message + } + } catch {} + + # Try to get headers from the response + try { + if ($_.Exception.Response) { + # Try different header access patterns for PS7 compatibility + try { + $requestId403Fetch = $_.Exception.Response.Headers['request-id'] + } catch { + try { + $requestId403Fetch = $_.Exception.Response.Headers.GetValues('request-id') | Select-Object -First 1 + } catch {} + } + try { + $wwwAuth403Fetch = $_.Exception.Response.Headers['WWW-Authenticate'] + } 
catch { + try { + $wwwAuth403Fetch = $_.Exception.Response.Headers.GetValues('WWW-Authenticate') | Select-Object -First 1 + } catch {} + } + + # Fallback: try to read response stream if ErrorDetails was empty + if (-not $responseBody403Fetch) { + try { + $respStream = $_.Exception.Response.GetResponseStream() + if ($respStream -and $respStream.CanRead) { + $reader = New-Object System.IO.StreamReader($respStream) + $responseBody403Fetch = $reader.ReadToEnd() + $reader.Dispose() + } + } catch {} + } + } + } catch {} + + # Check if permanent 403 + if ($responseBody403Fetch -match 'InsufficientPrivileges|Authorization_RequestDenied|AccessDenied|InvalidAuthenticationToken') { + $isPermanent403Fetch = $true + } + if ($wwwAuth403Fetch -match 'claims') { + $isPermanent403Fetch = $true + } + + # Log diagnostic info + try { + $diagLog = "[$(Get-Date -Format 'yyyy-MM-dd HH:mm:ss')] [403-FETCH-DIAG] Partition $idx/$tot Page $($telemetry.PageCount + 1)`n" + $diagLog += " client-request-id: $clientRequestId`n" + $diagLog += " request-id: $requestId403Fetch`n" + $diagLog += " exception: $exceptionMessage403Fetch`n" + if ($wwwAuth403Fetch) { $diagLog += " WWW-Authenticate: $wwwAuth403Fetch`n" } + if ($responseBody403Fetch) { $diagLog += " Response body: $responseBody403Fetch`n" } + $diagLog += " Permanent error: $isPermanent403Fetch" + $diagLog | Add-Content -Path $logPath -Encoding UTF8 -ErrorAction SilentlyContinue + } catch {} + + if ($isPermanent403Fetch) { + Write-Output "[403-PERM] Partition $idx/$tot - PERMANENT 403 on FETCH - Failing partition | request-id: $requestId403Fetch | client-request-id: $clientRequestId" + } + + # 403 Forbidden during FETCH - wait for main thread to refresh token first + $tokenRefreshed = Wait-ForTokenRefresh + if ($tokenRefreshed) { + # Token was refreshed - reset counter and retry immediately with fresh token + $fetchRetries = 0 + Write-Output "[403-FETCH] Partition $idx/$tot - Token refreshed, resetting retry counter and resuming fetch" + 
continue # Retry this fetch with new token + } + + # Token refresh failed/timed out - fall through to limited retry logic + $fetchRetries++ + $max403FetchRetries = 3 + + if ($fetchRetries -le $max403FetchRetries) { + # Exponential backoff: 15s, 30s, 60s + $retryAfter = [Math]::Min(15 * [Math]::Pow(2, $fetchRetries - 1), 60) + + # Thread-safe file logging + try { + $logMsg = "[$(Get-Date -Format 'yyyy-MM-dd HH:mm:ss')] [403-FETCH] Partition $idx/$tot Page $($telemetry.PageCount + 1) - Transient 403 (Attempt $fetchRetries/$max403FetchRetries) - Retrying in ${retryAfter}s" + $logMsg | Add-Content -Path $logPath -Encoding UTF8 -ErrorAction SilentlyContinue + } catch {} + + Write-Output "[403-FETCH] Partition $idx/$tot Page $($telemetry.PageCount + 1) - Transient 403 (Attempt $fetchRetries/$max403FetchRetries) - Retrying in ${retryAfter}s | request-id: $requestId403Fetch | client-request-id: $clientRequestId" + Start-Sleep -Seconds $retryAfter + } else { + # Max 403 retries exceeded - throw to trigger partition-level retry + try { + $logMsg = "[$(Get-Date -Format 'yyyy-MM-dd HH:mm:ss')] [403-FETCH] Partition $idx/$tot - Max transient 403 fetch retries exceeded ($max403FetchRetries), failing partition for retry with fresh token" + $logMsg | Add-Content -Path $logPath -Encoding UTF8 -ErrorAction SilentlyContinue + } catch {} + + Write-Output "[403-MAX] Partition $idx/$tot - Max transient 403 fetch retries exceeded - Failing partition | request-id: $requestId403Fetch | client-request-id: $clientRequestId" + throw "403 Forbidden (transient) on fetch - max retries exceeded (partition will retry with fresh token)" + } + } + elseif ($is429Fetch) { + $fetchRetries++ + $telemetry.ThrottledCount++ + # Respect Retry-After header if present, otherwise use 60s default + $retryAfter = 60 + if ($_.Exception.Response.Headers -and $_.Exception.Response.Headers['Retry-After']) { + $retryAfter = [int]$_.Exception.Response.Headers['Retry-After'] + } + $telemetry.RetryAfterTotalSeconds += 
$retryAfter + + # Log throttling event to user + Write-Host "[!] API Rate Limit (429) during record fetch - Partition $idx/$tot Page $($telemetry.PageCount + 1) - Retry in $retryAfter seconds (Attempt $fetchRetries/$maxFetchRetries)" -ForegroundColor Yellow + + Start-Sleep -Seconds $retryAfter + } + elseif ($isNetworkFetch) { + # Network error - check if we're still within the outage tolerance window + if (-not $fetchNetworkErrorStart) { + $fetchNetworkErrorStart = Get-Date + # Log to file only (no terminal spam) + Write-Output "[NETWORK] Partition $idx/$tot Page $($telemetry.PageCount + 1) - $fetchErrorSummary - Starting retry window (max ${maxOutageMinutes}m)" + } + + # Calculate elapsed outage time + $fetchElapsedOutageSeconds = ((Get-Date) - $fetchNetworkErrorStart).TotalSeconds + + if ($fetchElapsedOutageSeconds -lt $maxNetworkOutageSeconds) { + $fetchRemainingMinutes = [Math]::Ceiling(($maxNetworkOutageSeconds - $fetchElapsedOutageSeconds) / 60) + $fetchRetryDelay = 30 + (Get-Random -Minimum 10 -Maximum 30) # 30-60s random delay + + # Suppress subsequent retry messages to terminal (first error already shown) + # Log to file for troubleshooting + $fetchElapsedFormatted = [Math]::Round($fetchElapsedOutageSeconds, 1) + Write-Output "[NETWORK] Retry attempt for partition $idx/$tot Page $($telemetry.PageCount + 1) (${fetchElapsedFormatted}s elapsed) : $fetchErrorSummary" + + Start-Sleep -Seconds $fetchRetryDelay + } + else { + # Network outage exceeded tolerance + $fetchOutageMinutes = [Math]::Round($fetchElapsedOutageSeconds / 60, 1) + Write-Host "[ERROR] Partition $idx/$tot - Record fetch failed: Network outage exceeded $maxOutageMinutes minute tolerance (${fetchOutageMinutes}m elapsed)" -ForegroundColor Red + throw "Network outage exceeded $maxOutageMinutes minute tolerance during record fetch" + } + } + else { + throw # Re-throw non-throttling, non-network errors + } + } + } + + if (-not $fetchSuccess) { + throw "Failed to fetch records after $maxFetchRetries 
throttle retries" + } + + # Track page retrieval + $telemetry.PageCount++ + # Emit page count heartbeat on page 1 and every 200 pages so main thread can display progress in STATUS line + if ($telemetry.PageCount -eq 1 -or $telemetry.PageCount % 200 -eq 0) { + Write-Output "[PROGRESS] P${idx}/$tot pg$($telemetry.PageCount)" + } + if ($telemetry.PageCount -eq 1) { + $telemetry.FirstPageAt = (Get-Date).ToString('yyyy-MM-dd HH:mm:ss') + } + $telemetry.LastPageAt = (Get-Date).ToString('yyyy-MM-dd HH:mm:ss') + + if ($recordsResponse.value) { + foreach ($record in $recordsResponse.value) { + # Normalize to EOM-compatible format inline + # PERF: Store _ParsedAuditData to avoid re-parsing JSON during explosion + # NOTE: Using InvariantCulture directly here since Parse-DateSafe isn't available in ThreadJob scope + $normalized = [PSCustomObject]@{ + RecordType = $record.auditLogRecordType + CreationDate = if ($record.createdDateTime) { try { [datetime]::Parse($record.createdDateTime, [System.Globalization.CultureInfo]::InvariantCulture) } catch { $null } } else { $null } + UserIds = $record.userPrincipalName + Operations = $record.operation + AuditData = if ($record.auditData) { $record.auditData | ConvertTo-Json -Depth 100 -Compress } else { '{}' } + _ParsedAuditData = $record.auditData # Already-parsed object from Graph API + ResultIndex = $pageFlushTotalCount + $allRecords.Count + 1 + ResultCount = 1 + Identity = $record.id + IsValid = $true + ObjectState = 'Unchanged' + } # Apply UserIds filter if specified + $includeRecord = $true + if ($userIds -and $userIds.Count -gt 0) { + $includeRecord = $userIds -contains $normalized.UserIds + } + + if ($includeRecord) { + [void]$allRecords.Add($normalized) + } + + # Only break if resultSize > 0 and we've reached the limit + if ($resultSize -gt 0 -and ($pageFlushTotalCount + $allRecords.Count) -ge $resultSize) { + break + } + } + } + + # Check for next page + $recordsUri = if ($recordsResponse.'@odata.nextLink') { 
$recordsResponse.'@odata.nextLink' } else { $null } + + # PER-PAGE MEMORY FLUSH: Append current page to disk and clear in-memory batch + if ($memoryFlushEnabled -and $allRecords.Count -gt 0) { + try { + if (-not (Test-Path $incrementalDir)) { New-Item -ItemType Directory -Path $incrementalDir -Force | Out-Null } + if (-not $pageFlushFilePath) { + $pageFlushFilePath = Join-Path $incrementalDir "Part${idx}_${runTimestamp}_qid-${queryId}_${jobRunId}.jsonl" + } + $allRecords | ForEach-Object { $_ | ConvertTo-Json -Depth 10 -Compress } | Add-Content -Path $pageFlushFilePath -Encoding utf8 + $pageFlushTotalCount += $allRecords.Count + $allRecords.Clear() + $threadSavedToDisk = $true + } catch { + Write-Output "[FLUSH-WARN] Partition $idx/$tot - Page flush failed, keeping in memory: $($_.Exception.Message)" + } + } + + # ROLLING PERSISTENCE SAFETY NET: Write backup snapshot every 500 pages + # This is write-only — $allRecords is NOT cleared. Normal flow is unchanged. + # If the process crashes mid-pagination, these files exist for manual recovery. 
+ if (-not $memoryFlushEnabled -and $incrementalDir -and $allRecords.Count -gt 0 -and $telemetry.PageCount -gt 0 -and $telemetry.PageCount % 500 -eq 0) { + try { + if (-not (Test-Path $incrementalDir)) { New-Item -ItemType Directory -Path $incrementalDir -Force | Out-Null } + $snapshotFile = Join-Path $incrementalDir "Part${idx}_${runTimestamp}_snapshot_page$($telemetry.PageCount)_$($allRecords.Count)records.jsonl" + $allRecords | ForEach-Object { $_ | ConvertTo-Json -Depth 10 -Compress } | Out-File -FilePath $snapshotFile -Encoding utf8 -Force + Write-Output "[SNAPSHOT] Partition $idx/$tot - Safety snapshot at page $($telemetry.PageCount): $($allRecords.Count) records written to disk" + } catch { + Write-Output "[SNAPSHOT-WARN] Partition $idx/$tot - Failed to write safety snapshot: $($_.Exception.Message)" + } + } + } + } + catch { + # Check if this is a CREATE-FAILED error bubbling up (not a fetch error) + $errorMsg = $_.Exception.Message + if ($errorMsg -match '^\[CREATE-FAILED\]') { + # Re-throw CREATE errors as-is - they should propagate out of the scriptblock + throw + } + + # This catch is for unexpected errors during FETCH phase (not throttling or network errors, which are handled above) + # Examples: JSON parsing failures, unexpected response format, etc. 
+ $unexpectedError = $errorMsg + $unexpectedStack = $_.ScriptStackTrace + + # Increment retry counter and check if retries remain + $fetchErrorRetryCount++ + + if ($fetchErrorRetryCount -lt $maxFetchErrorRetries) { + # 401/Unauthorized: Wait for main thread token refresh before retrying (same pattern as 403 handler) + if ($unexpectedError -match '401|Unauthorized') { + $tokenRefreshed = Wait-ForTokenRefresh + if ($tokenRefreshed) { + $fetchErrorRetryCount = 0 # Reset retries — fresh token deserves fresh attempts + Write-Output "[FETCH-RETRY] Partition $idx/$tot - 401 detected, token refreshed, resetting retry counter" + continue # Retry immediately with fresh token + } + } + + # Retries remain - log and retry the same page + Write-Output "[FETCH-RETRY] Partition $idx/$tot - Unexpected error ($fetchErrorRetryCount/$maxFetchErrorRetries) - Retrying in 30s: $unexpectedError" + try { + $logMsg = "[$(Get-Date -Format 'yyyy-MM-dd HH:mm:ss')] [FETCH-RETRY] Partition $idx/$tot (Query $queryId) - Retry $fetchErrorRetryCount/$maxFetchErrorRetries for: $unexpectedError" + $logMsg | Add-Content -Path $using:LogFile -Encoding UTF8 -ErrorAction SilentlyContinue + } catch {} + Start-Sleep -Seconds 30 + continue # Retry pagination loop with same $recordsUri + } + + # All retries exhausted - fail the partition + $unexpectedProcessingError = $true + $unexpectedProcessingMessage = $unexpectedError + + # Output error message (will be deduplicated by parent) + Write-Output "[ERROR] Partition $idx/$tot - Unexpected error during record processing after $fetchErrorRetryCount retries - will retry at end of run" + + # Full error details to ERROR stream (will be captured by main thread) + Write-Error "[ERROR] Partition $idx/$tot (Query $queryId) - Unexpected record processing error: $unexpectedError`n Stack: $unexpectedStack" -ErrorAction Continue + # Thread-safe file logging + try { + $logMsg = "[$(Get-Date -Format 'yyyy-MM-dd HH:mm:ss')] [ERROR] Partition $idx/$tot (Query $queryId) - 
Unexpected record processing error after $fetchErrorRetryCount retries: $unexpectedError" + $logMsg | Add-Content -Path $using:LogFile -Encoding UTF8 -ErrorAction SilentlyContinue + " Stack trace: $unexpectedStack" | Add-Content -Path $using:LogFile -Encoding UTF8 -ErrorAction SilentlyContinue + } catch { + # Ignore logging errors in job + } + + # Break pagination loop - retries exhausted + break + } + $t1 = Get-Date + + # Finalize telemetry + $telemetry.RowCount = $pageFlushTotalCount + $allRecords.Count + + # POST-FETCH 10K LIMIT DETECTION: Only applies to EOM mode (resultSize > 0) + if ($resultSize -gt 0 -and ($pageFlushTotalCount + $allRecords.Count) -eq 10000) { + $partitionHours = ($pEnd - $pStart).TotalHours + $minSubdivisionDays = 0.001389 # 2 minutes + $minSubdivisionHours = $minSubdivisionDays * 24 + + if ($partitionHours -gt $minSubdivisionHours) { + # Partition can be subdivided - flag for subdivision + $telemetry.PostFetch10KLimit = $true + $telemetry.SubdivisionReason = "postfetch_10k_limit" + $script:Hit10KLimit = $true + + Write-Host "[SUBDIVISION] Partition $idx/$tot - Fetched exactly 10,000 records (EOM limit reached) - Needs subdivision ($([Math]::Round($partitionHours,2))h window)" -ForegroundColor Yellow + + # Return subdivision signal + return [PSCustomObject]@{ + QueryId = $queryId + Status = 'needs_subdivision' + PreviewCount = 10000 + PartitionStart = $pStart + PartitionEnd = $pEnd + PartitionIndex = $idx + PartitionTotal = $tot + RetrievedCount = 10000 + Telemetry = $telemetry + } + } else { + Write-Host "[LIMIT] Partition $idx/$tot - Fetched 10,000 records at minimum subdivision window ($([Math]::Round($partitionHours,2))h, cannot subdivide further)" -ForegroundColor Yellow + $script:Hit10KLimit = $true + } + } + # POST-FETCH 1M LIMIT DETECTION: Graph API has 1,000,000 record limit per query + elseif ($resultSize -eq 0 -and ($pageFlushTotalCount + $allRecords.Count) -ge 1000000) { + $partitionHours = ($pEnd - $pStart).TotalHours + 
$minSubdivisionDays = 0.001389 # 2 minutes + $minSubdivisionHours = $minSubdivisionDays * 24 + + if ($partitionHours -gt $minSubdivisionHours) { + # Partition can be subdivided - flag for subdivision + $telemetry.PostFetch1MLimit = $true + $telemetry.SubdivisionReason = "postfetch_1m_limit" + $script:Hit1MLimit = $true + + Write-Host "[SUBDIVISION] Partition $idx/$tot - Fetched 1,000,000 records (Graph API limit reached) - Needs subdivision ($([Math]::Round($partitionHours,2))h window)" -ForegroundColor Yellow + + # Return subdivision signal + return [PSCustomObject]@{ + QueryId = $queryId + Status = 'needs_subdivision' + PreviewCount = 1000000 + PartitionStart = $pStart + PartitionEnd = $pEnd + PartitionIndex = $idx + PartitionTotal = $tot + RetrievedCount = 1000000 + Telemetry = $telemetry + } + } else { + Write-Host "[LIMIT] Partition $idx/$tot - Fetched 1,000,000 records at minimum subdivision window ($([Math]::Round($partitionHours,2))h, cannot subdivide further)" -ForegroundColor Yellow + $script:Hit1MLimit = $true + } + } + + if ($unexpectedProcessingError) { + $telemetry.Status = 'failed' + # Clean up Purview query to free server slot after all fetch retries exhausted + if ($queryId) { + try { + Invoke-RestMethod -Method DELETE -Uri (Get-AuditUri -path "queries/$queryId") ` + -Headers (Get-CurrentHeaders -ClientRequestId $clientRequestId) -ErrorAction SilentlyContinue | Out-Null + Write-Output "[CLEANUP] Partition $idx/$tot - Deleted query $queryId after $fetchErrorRetryCount failed fetch retries" + } catch { + # Silently continue - cleanup failure shouldn't block retry + } + } + throw [System.Exception]::new("Unexpected record processing error: $unexpectedProcessingMessage") + } + + # Thread-side final persistence (memory-flush mode): write final partition JSONL before returning + if ($memoryFlushEnabled -and $incrementalDir -and $allRecords.Count -gt 0) { + try { + if (-not (Test-Path $incrementalDir)) { + New-Item -ItemType Directory -Path 
$incrementalDir -Force | Out-Null + } + $threadSavedFile = Join-Path $incrementalDir "Part${idx}_${runTimestamp}_qid-${queryId}_$($allRecords.Count)records.jsonl" + $allRecords | ForEach-Object { $_ | ConvertTo-Json -Depth 10 -Compress } | Out-File -FilePath $threadSavedFile -Encoding utf8 -Force + $threadSavedToDisk = $true + Write-Output "[SAVE-THREAD] Partition $($idx)/$($tot): $($allRecords.Count) records persisted by thread job" + } catch { + Write-Output "[SAVE-THREAD-WARN] Partition $($idx)/$($tot): final thread persistence failed: $($_.Exception.Message)" + } + } elseif ($memoryFlushEnabled -and $pageFlushFilePath) { + # All records were already flushed per-page; just ensure thread-saved references are set + $threadSavedToDisk = $true + $threadSavedFile = $pageFlushFilePath + } + + # Emit success notification for outer monitor (display handled once outside the job) + Write-Output "[SUCCESS] Query succeeded - Partition $idx/$tot - Query ID: $queryId - Retrieved $($pageFlushTotalCount + $allRecords.Count) records" + + # Clean up query after successful record retrieval (best-effort) + if ($queryId) { + try { + Invoke-RestMethod -Method DELETE -Uri (Get-AuditUri -path "queries/$queryId") ` + -Headers (Get-CurrentHeaders -ClientRequestId $clientRequestId) -ErrorAction SilentlyContinue | Out-Null + } catch { + # Silently continue - cleanup failure shouldn't block results + } + } + + # Return results with telemetry + $returnLogs = if ($memoryFlushEnabled -and $threadSavedToDisk) { @() } else { $allRecords } + [pscustomobject]@{ + Activity = $activity + Logs = $returnLogs + RetrievedCount = $pageFlushTotalCount + $allRecords.Count + ElapsedMs = [int]($t1 - $t0).TotalMilliseconds + Partition = $idx + Total = $tot + QueryId = $queryId + DebugInfo = $debugInfo + Telemetry = $telemetry + ThreadSavedToDisk = $threadSavedToDisk + ThreadSavedFile = $threadSavedFile + } + } + } + # Skip parallel execution machinery when sequential mode is active + # The sequential fallback 
(if -not $canParallel) handles processing below + if (-not $canParallel) { + # Skip to sequential fallback - set flag to bypass the while loop + $allPartitionsProcessed = $true + } + + # Diagnostic: Show concurrency settings before creating jobs (only for parallel mode) + if ($canParallel) { + $diagMsg = "[CONCURRENCY] Partitions=$($partitions.Count) MaxConcurrency=$MaxConcurrency" + Write-LogHost $diagMsg -ForegroundColor Cyan + } + + # Initialize job result tracking + $script:processedJobIds = New-Object System.Collections.Generic.HashSet[int] + + # Pre-compute incremental directory and timestamp for ThreadJob rolling persistence safety net + $threadIncrementalDir = if ($script:PartialOutputPath) { Join-Path (Split-Path $script:PartialOutputPath -Parent) ".pax_incremental" } else { $null } + $threadRunTimestamp = $global:ScriptRunTimestamp + $threadMemoryFlushEnabled = [bool]$script:memoryFlushEnabled + + # Outer loop: Continue creating jobs until all partitions (including subdivided ones) are processed + # This handles dynamic subdivision where new partitions are added during execution + $subdivisionPass = 0 + if (-not $allPartitionsProcessed) { $allPartitionsProcessed = $false } + + while (-not $allPartitionsProcessed) { + $subdivisionPass++ + + # Find partitions that need jobs created (haven't been processed yet) + # FIX E: Include 'Failed' status to enable retry of failed partitions + # For Failed partitions, we allow retry even if they're in partitionsWithJobs (their previous job failed) + $pendingPartitions = @($partitions | Where-Object { + $statusObj = $script:partitionStatus[$_.Index] + if (-not $statusObj) { return $false } + if ($statusObj.Status -eq 'NotStarted' -and -not $script:partitionsWithJobs.Contains($_.Index)) { return $true } + if ($statusObj.Status -eq 'Failed') { return $true } # Allow retry of failed partitions + return $false + }) + + if ($pendingPartitions.Count -eq 0) { + # No new partitions to process, we're done + 
$allPartitionsProcessed = $true + break + } + + if ($subdivisionPass -gt 1) { + Write-LogHost "" -ForegroundColor Yellow + Write-LogHost "=== Subdivision Pass $subdivisionPass ===" -ForegroundColor Yellow + Write-LogHost "Processing $($pendingPartitions.Count) new sub-partitions from previous subdivisions..." -ForegroundColor Yellow + } + + # Create initial jobs for all PENDING partitions with backpressure + foreach ($pt in $pendingPartitions) { + # ============================================================ + # PROACTIVE TOKEN REFRESH: Check before each job creation + # Job launch phase can take 20-35+ minutes with 60 partitions + # Token may expire during this phase if not refreshed + # ============================================================ + $refreshResult = Refresh-GraphTokenIfNeeded -BufferMinutes 5 + # CRITICAL: Use -is [string] check to avoid PowerShell coercion bug where $true -eq 'Quit' returns True + if ($refreshResult -is [string] -and $refreshResult -eq 'Quit') { + # User chose to quit at auth prompt - save checkpoint and exit gracefully + if ($script:CheckpointEnabled) { Save-Checkpoint -Force } + Show-CheckpointExitMessage + exit 0 + } + + # ============================================================ + # CHECKPOINT: Reactive token refresh for auth failures (401s) + # This ALWAYS runs when AuthFailureDetected is true (not gated by CheckpointEnabled) + # AppRegistration: automatic silent refresh (headless) + # Interactive modes: user can wait at R/Q prompt indefinitely + # ============================================================ + if (Test-ShouldPromptTokenRefresh) { + # AppRegistration: Try automatic silent refresh first + if ($script:AuthConfig.Method -eq 'AppRegistration' -and $script:AuthConfig.CanReauthenticate) { + $refreshResult = Invoke-TokenRefresh -Force + if ($refreshResult.Success -and $refreshResult.NewToken) { + $script:AuthFailureDetected = $false + $script:Auth401MessageShown = $false + Write-LogHost " [AUTH] Token refreshed 
automatically (AppRegistration)" -ForegroundColor Green + } elseif ($Force) { + # -Force mode: FATAL exit (true headless operation) + Write-LogHost " [AUTH] FATAL: AppRegistration token refresh failed (-Force mode)" -ForegroundColor Red + if ($script:CheckpointEnabled) { Save-Checkpoint -Force } + Show-CheckpointExitMessage + exit 0 + } else { + # No -Force: fall back to interactive prompt + Write-LogHost " [AUTH] Silent refresh failed - falling back to interactive prompt" -ForegroundColor Yellow + $refreshResult = Invoke-TokenRefreshPrompt + if ($refreshResult -eq 'Quit') { + if ($script:CheckpointEnabled) { Save-Checkpoint -Force } + Show-CheckpointExitMessage + exit 0 + } + } + } else { + # Interactive modes: prompt user + $refreshResult = Invoke-TokenRefreshPrompt + if ($refreshResult -eq 'Quit') { + # User chose to quit - save checkpoint if enabled and exit + if ($script:CheckpointEnabled) { Save-Checkpoint -Force } + Show-CheckpointExitMessage + exit 0 + } + } + # Proceed with fresh token + } + + # Backpressure: Wait for a slot if we've reached MaxConcurrency + # Count only non-completed jobs (Running or NotStarted) + $activeJobs = @($jobs | Where-Object { $_.State -in 'Running','NotStarted' }) + while ($activeJobs.Count -ge $MaxConcurrency) { + Write-Verbose "[BACKPRESSURE] Waiting for job slot (active: $($activeJobs.Count)/$MaxConcurrency)..." 
-Verbose:$VerbosePreference + + # PROACTIVE TOKEN REFRESH: Check while waiting for job slots + $refreshResult = Refresh-GraphTokenIfNeeded -BufferMinutes 5 + # CRITICAL: Use -is [string] check to avoid PowerShell coercion bug where $true -eq 'Quit' returns True + if ($refreshResult -is [string] -and $refreshResult -eq 'Quit') { + # User chose to quit at auth prompt - save checkpoint and exit gracefully + if ($script:CheckpointEnabled) { Save-Checkpoint -Force } + Show-CheckpointExitMessage + exit 0 + } + + # Track when backpressure started (for server-side slot exhaustion detection) + if (-not $script:backpressureStartTime) { $script:backpressureStartTime = Get-Date } + + # STATUS UPDATE: Show periodic status during backpressure wait + if (-not $script:lastBackpressureStatus) { $script:lastBackpressureStatus = Get-Date } + $backpressureElapsed = ((Get-Date) - $script:lastBackpressureStatus).TotalSeconds + if ($backpressureElapsed -ge $StatusIntervalSeconds) { + $completedCount = @($jobs | Where-Object { $_.State -eq 'Completed' }).Count + $runningCount = @($jobs | Where-Object { $_.State -eq 'Running' }).Count + # Count partitions that have actually sent queries (have QueryId) + $sentToServerCount = @($script:partitionStatus.Values | Where-Object { $_.QueryId }).Count + $ts = Get-Date -Format 'HH:mm:ss' + Write-LogHost "[STATUS] [$ts] Partitions (Queries): $runningCount active | $sentToServerCount sent | $($completedCount + $script:resumeCompletedOffset)/$totalPartitions complete" -ForegroundColor Yellow + $script:lastBackpressureStatus = Get-Date + } + + # SERVER-SIDE SLOT EXHAUSTION DETECTION: Check if we're stuck because Purview has existing queries + # Only show once, after 3 minutes of backpressure, if SENT count < MaxConcurrency + if (-not $script:serverSlotWarningShown) { + $backpressureDuration = ((Get-Date) - $script:backpressureStartTime).TotalMinutes + $sentToServerCount = @($script:partitionStatus.Values | Where-Object { $_.QueryId }).Count + if 
($backpressureDuration -ge 3 -and $sentToServerCount -lt $MaxConcurrency) { + $missingSlots = $MaxConcurrency - $sentToServerCount + $ts = Get-Date -Format 'HH:mm:ss' + Write-LogHost "" -ForegroundColor Cyan + Write-LogHost "[$ts] [INFO] Server-side query limit may be reached. Only $sentToServerCount of $MaxConcurrency queries sent to Purview after 3+ minutes." -ForegroundColor Cyan + Write-LogHost " This may indicate $missingSlots existing query/queries (from previous runs or the Purview portal) are consuming server slots." -ForegroundColor Cyan + Write-LogHost " Check Purview portal -> Audit -> search jobs for stuck/running queries." -ForegroundColor Cyan + Write-LogHost " You can cancel previous jobs if they aren't needed for anything else or for anyone else." -ForegroundColor Cyan + Write-LogHost "" -ForegroundColor Cyan + $script:serverSlotWarningShown = $true + } + } + + # REACTIVE AUTH CHECK: Handle 401 errors during backpressure wait + # AppRegistration: automatic silent refresh (headless) + # Interactive modes: prompt user for re-authentication + if ($script:AuthFailureDetected) { + Write-LogHost "" -ForegroundColor Red + Write-LogHost " [AUTH] 401 detected during job launch - initiating token refresh..." 
-ForegroundColor Red + + # AppRegistration: Use automatic silent refresh (no user interaction) + if ($script:AuthConfig.Method -eq 'AppRegistration' -and $script:AuthConfig.CanReauthenticate) { + $refreshResult = Invoke-TokenRefresh -Force + if ($refreshResult.Success -and $refreshResult.NewToken) { + $script:AuthFailureDetected = $false + $script:Auth401MessageShown = $false + # Update shared auth state for thread jobs + $script:SharedAuthState.Token = $refreshResult.NewToken + $script:SharedAuthState.ExpiresOn = (Get-Date).ToUniversalTime().AddMinutes(50) + $script:SharedAuthState.LastRefresh = Get-Date + Write-LogHost " [AUTH] Token refreshed automatically (AppRegistration)" -ForegroundColor Green + } elseif ($Force) { + # -Force mode: FATAL exit (true headless operation) + Write-LogHost " [AUTH] FATAL: AppRegistration token refresh failed (-Force mode)" -ForegroundColor Red + if ($script:CheckpointEnabled) { Save-Checkpoint -Force } + Write-LogHost " Exiting due to authentication failure. Use -Resume to continue later." -ForegroundColor Yellow + return + } else { + # No -Force: fall back to interactive prompt + Write-LogHost " [AUTH] Silent refresh failed - falling back to interactive prompt" -ForegroundColor Yellow + $refreshResult = Invoke-TokenRefreshPrompt + if ($refreshResult -eq 'Quit') { + if ($script:CheckpointEnabled) { Save-Checkpoint -Force } + Write-LogHost " Exiting due to user request. Use -Resume to continue later." 
-ForegroundColor Yellow + return + } + # Update shared auth state for thread jobs + $tokenInfo = Get-GraphAccessTokenWithExpiry + if ($tokenInfo) { + $script:SharedAuthState.Token = $tokenInfo.Token + $script:SharedAuthState.ExpiresOn = $tokenInfo.ExpiresOn + $script:SharedAuthState.LastRefresh = Get-Date + } + } + } else { + # Interactive modes: prompt user + $refreshResult = Invoke-TokenRefreshPrompt + if ($refreshResult -eq 'Quit') { + if ($script:CheckpointEnabled) { Save-Checkpoint -Force } + Write-LogHost " Exiting due to user request. Use -Resume to continue later." -ForegroundColor Yellow + return + } + # Update shared auth state for thread jobs + $tokenInfo = Get-GraphAccessTokenWithExpiry + if ($tokenInfo) { + $script:SharedAuthState.Token = $tokenInfo.Token + $script:SharedAuthState.ExpiresOn = $tokenInfo.ExpiresOn + $script:SharedAuthState.LastRefresh = Get-Date + } + } + Write-LogHost " [AUTH] Token refreshed - resuming job launch" -ForegroundColor Green + } + + Start-Sleep -Milliseconds 500 + + # Collect output from active jobs while waiting (with error detection) + # FIX: Process ALL jobs including completed ones to ensure JSONL gets saved + foreach ($activeJob in $activeJobs) { + if ($script:AuthFailureDetected) { break } # IMMEDIATE EXIT on auth failure + # REMOVED: if ($activeJob.State -eq 'Completed') { continue } + # Completed jobs MUST be processed to get their result objects for JSONL save + try { + # NO -Keep: process result objects fully here (JSONL save, checkpoint) + $waitOutput = Receive-Job -Job $activeJob -ErrorAction SilentlyContinue -ErrorVariable backpressureJobErrors + + # Check for 401 errors in job output + if ($backpressureJobErrors) { + foreach ($err in $backpressureJobErrors) { + $errMsg = if ($err.Exception) { $err.Exception.Message } else { $err.ToString() } + if ($errMsg -match '401|Unauthorized|token.*expired') { + $script:AuthFailureDetected = $true + if (-not $script:Auth401MessageShown) { + $script:Auth401MessageShown = 
$true + Write-LogHost " [AUTH] 401 Unauthorized detected - stopping to re-authenticate" -ForegroundColor Red + } + break # IMMEDIATE EXIT + } + } + } + if ($script:AuthFailureDetected) { break } # Exit if flag was set + if ($waitOutput) { + foreach ($output in $waitOutput) { + if ($script:AuthFailureDetected) { break } # IMMEDIATE EXIT + if ($output -is [string]) { + $msgKey = "$($activeJob.Id):$output" + $color = $null + if ($output -match '^\[ATTEMPT\]') { + $msgKey = "$($activeJob.Id):ATTEMPT" + $color = 'DarkGray' + } + elseif ($output -match '^\[SENT\]') { + $color = 'DarkGray' + # CRITICAL: Extract and store QueryId from SENT message for retry reuse + if ($output -match 'QueryId:\s*([a-f0-9-]+)\)') { + $extractedQueryId = $matches[1] + $jobPartition = $jobMeta[$activeJob.Id] + if ($jobPartition -and $script:partitionStatus.ContainsKey($jobPartition.Index)) { + $script:partitionStatus[$jobPartition.Index].QueryId = $extractedQueryId + + # CHECKPOINT: Save QueryCreated state so we can resume data fetch if interrupted + if ($script:CheckpointEnabled) { + Save-Checkpoint -PartitionIndex $jobPartition.Index -QueryId $extractedQueryId -State 'QueryCreated' + } + } + } + } + elseif ($output -match '^\[ERROR\]') { + $color = 'Red' + # Check for 401 in error messages from thread jobs + if ($output -match '401|Unauthorized') { + $script:AuthFailureDetected = $true + if (-not $script:Auth401MessageShown) { + $script:Auth401MessageShown = $true + Write-LogHost " [AUTH] 401 Unauthorized detected in job output" -ForegroundColor Red + } + # CRITICAL: Mark this partition as Failed for retry after re-auth + $jobPartition = $jobMeta[$activeJob.Id] + if ($jobPartition -and $script:partitionStatus.ContainsKey($jobPartition.Index)) { + $script:partitionStatus[$jobPartition.Index].Status = 'Failed' + $script:partitionStatus[$jobPartition.Index].LastError = '401 Unauthorized - token expired' + } + break # IMMEDIATE EXIT - don't process more output + } else { + # FIX: Non-401 
[ERROR] messages (e.g., "Unexpected error during record processing") + # must also mark the partition as Failed for retry + $jobPartition = $jobMeta[$activeJob.Id] + if ($jobPartition -and $script:partitionStatus.ContainsKey($jobPartition.Index)) { + $script:partitionStatus[$jobPartition.Index].Status = 'Failed' + $script:partitionStatus[$jobPartition.Index].LastError = $output + Write-LogHost " [RETRY-QUEUE] Partition $($jobPartition.Index)/$($jobPartition.Total) queued for retry at end of run" -ForegroundColor Yellow + } + } + } + elseif ($output -match '^\[403-(CREATE|POLL|FETCH)\]') { + # Transient 403 retry messages - dedupe by partition+attempt + $msgKey = "$($activeJob.Id):403:$output" + $color = 'Magenta' + } + elseif ($output -match '^\[403-(PERM|MAX)\]') { + # Permanent/max retry 403 messages + $msgKey = "$($activeJob.Id):403:$output" + $color = 'Red' + } + elseif ($output -match '^\[STATUS\] Query running') { + $msgKey = "$($activeJob.Id):STATUS" + $color = 'Yellow' + } + elseif ($output -match '^\[SUCCESS\]') { + $msgKey = "$($activeJob.Id):SUCCESS" + $color = 'Green' + } + if ($color -and -not $script:shownJobMessages.ContainsKey($msgKey)) { + $message = if ($output -match '^\[(STATUS|SUCCESS)\]\s*') { + $output -replace '^\[STATUS\]\s*','' -replace '^\[SUCCESS\]\s*','' + } else { + $output + } + Write-LogHost $message -ForegroundColor $color + $script:shownJobMessages[$msgKey] = $true + } + } + elseif ($output -isnot [string] -and -not $script:processedJobIds.Contains($activeJob.Id)) { + # DEFENSIVE: Detect SplitRequired objects (should never arrive now, but guard against it) + if ($output.SplitRequired -eq $true) { + $jobPartition = $jobMeta[$activeJob.Id] + if ($jobPartition -and $script:partitionStatus.ContainsKey($jobPartition.Index)) { + $script:partitionStatus[$jobPartition.Index].Status = 'Failed' + $script:partitionStatus[$jobPartition.Index].LastError = "Query still running after $([Math]::Round($output.ElapsedMinutes,1)) min - queued for 
retry" + Write-LogHost " [RETRY-QUEUE] Partition $($jobPartition.Index)/$($jobPartition.Total) - query still processing after $([Math]::Round($output.ElapsedMinutes,1)) min, queued for retry" -ForegroundColor Yellow + } + [void]$script:processedJobIds.Add($activeJob.Id) + continue + } + # FULL RESULT PROCESSING: Handle result objects immediately to ensure JSONL save + $jobPartition = $jobMeta[$activeJob.Id] + if ($jobPartition -and $script:partitionStatus.ContainsKey($jobPartition.Index)) { + $currentStatus = $script:partitionStatus[$jobPartition.Index].Status + if ($currentStatus -ne 'Complete') { + if ([string]::IsNullOrWhiteSpace([string]$output.QueryId) -and [int]($output.RetrievedCount ?? 0) -le 0) { + $script:partitionStatus[$jobPartition.Index].Status = 'Failed' + $script:partitionStatus[$jobPartition.Index].LastError = 'ThreadJob returned empty QueryId with zero records (query was not created/sent)' + Write-LogHost " [DATA-CHECK] Partition $($jobPartition.Index) returned empty QueryId with 0 records - marked Failed for retry" -ForegroundColor Yellow + [void]$script:processedJobIds.Add($activeJob.Id) + continue + } + + $script:partitionStatus[$jobPartition.Index].Status = 'Complete' + $script:partitionStatus[$jobPartition.Index].QueryId = $output.QueryId + $script:partitionStatus[$jobPartition.Index].RecordCount = ($output.RetrievedCount ?? 0) + + # Fallback: emit SUCCESS message if the original [SUCCESS] string was already consumed + $successKey = "$($activeJob.Id):SUCCESS" + if (-not $script:shownJobMessages.ContainsKey($successKey)) { + $script:shownJobMessages[$successKey] = $true + Write-LogHost "Query succeeded - Partition $($jobPartition.Index)/$($jobPartition.Total) - Query ID: $($output.QueryId) - Retrieved $($output.RetrievedCount ?? 
0) records" -ForegroundColor Green + } + + if ($output.ThreadSavedToDisk -and $script:memoryFlushEnabled -and -not $script:memoryFlushed) { + $script:memoryFlushed = $true + Write-LogHost " [MEMORY] Thread-side persistence active - streaming export will use JSONL files" -ForegroundColor Yellow + } + + # Add logs to collection (skip when memory flush enabled - data goes to JSONL only) + if ($output.Logs -and $output.Logs.Count -gt 0 -and -not $script:memoryFlushEnabled) { + $allLogs.AddRange($output.Logs) + } + + # Update metrics + try { + $script:metrics.QueryMs += [int]$output.ElapsedMs + $script:metrics.TotalRecordsFetched += [int]$output.RetrievedCount + } catch {} + + # Save checkpoint + if ($script:CheckpointEnabled) { + Save-Checkpoint -PartitionIndex $jobPartition.Index -QueryId $output.QueryId -State 'Completed' -RecordCount $output.RetrievedCount + } + + # INCREMENTAL SAVE: Write JSONL immediately + if ($output.Logs -and $output.Logs.Count -gt 0) { + try { + $incrementalDir = Join-Path (Split-Path $script:PartialOutputPath -Parent) ".pax_incremental" + if (-not (Test-Path $incrementalDir)) { + New-Item -ItemType Directory -Path $incrementalDir -Force | Out-Null + } + $incrementalFile = Join-Path $incrementalDir "Part$($jobPartition.Index)_${global:ScriptRunTimestamp}_qid-$($output.QueryId)_$($output.RetrievedCount)records.jsonl" + $output.Logs | ForEach-Object { $_ | ConvertTo-Json -Depth 10 -Compress } | Out-File -FilePath $incrementalFile -Encoding utf8 -Force + Write-LogHost " [SAVE] Partition $($jobPartition.Index): $($output.RetrievedCount) records saved to disk" -ForegroundColor DarkGreen + + # Release source reference to allow GC to reclaim memory + $output.Logs = $null + + # MEMORY MANAGEMENT: When enabled, we skip AddRange and use JSONL-only path + # Set memoryFlushed flag to signal streaming export + if ($script:memoryFlushEnabled -and -not $script:memoryFlushed) { + $script:memoryFlushed = $true + Write-LogHost " [MEMORY] Memory management 
active - data written to JSONL only (limit: $($script:ResolvedMaxMemoryMB)MB)" -ForegroundColor Yellow + } + } catch { + Write-LogHost " [WARN] Failed to save incremental data for Partition $($jobPartition.Index): $($_.Exception.Message)" -ForegroundColor Yellow + } + } + + # Mark job as processed + [void]$script:processedJobIds.Add($activeJob.Id) + } + } + } + } + } + } catch {} + } + + # Recount active jobs + $activeJobs = @($jobs | Where-Object { $_.State -in 'Running','NotStarted' }) + } + + # Create the job now that we have a slot + # Graph API: MaxRecords=0 (unlimited) - 10K limit only applies to EOM mode + $graphResultSize = if ($UseEOM) { $ResultSize } else { 0 } + + # For fetch-only partitions (resume mode), pass stored QueryId to skip query creation + $existingQueryIdForJob = if ($pt.StoredQueryId) { $pt.StoredQueryId } else { $null } + if ($existingQueryIdForJob) { + Write-LogHost "[RESUME] Partition $($pt.Index)/$($pt.Total) - Using stored QueryId: $existingQueryIdForJob" -ForegroundColor Yellow + } + + $job = Start-ThreadJob -ThrottleLimit $maxConcurrentPartitions -ScriptBlock $queryJobScriptBlock -ArgumentList $pt.PStart, $pt.PEnd, $activities, $graphResultSize, $UserIds, $pt.Index, $pt.Total, $script:SharedAuthState, $pt, $MaxNetworkOutageMinutes, $script:GraphAuditApiVersion, $script:LogFile, $existingQueryIdForJob, $threadIncrementalDir, $threadRunTimestamp, $threadMemoryFlushEnabled + $jobs += $job + $jobMeta[$job.Id] = $pt + + # Mark this partition as having a job created for it + [void]$script:partitionsWithJobs.Add($pt.Index) + if ($script:partitionStatus.ContainsKey($pt.Index)) { + $script:partitionStatus[$pt.Index].Status = 'JobCreated' + } + + # Log each job creation to terminal + $createdTimestamp = Get-Date -Format 'HH:mm:ss' + Write-LogHost "[CREATED] [$createdTimestamp] Partition $($pt.Index)/$($pt.Total) - Job created" -ForegroundColor DarkGray + + # Poll for output from this job for up to 10 seconds (Purview API can have unpredictable 
latency) + Start-Sleep -Milliseconds 50 # Small initial delay to let job start + $pollAttempts = 0 + $maxPollAttempts = 100 # 100 x 100ms = 10 seconds + $gotSentMessage = $false + + while ($pollAttempts -lt $maxPollAttempts -and -not $gotSentMessage) { + Start-Sleep -Milliseconds 100 + $pollAttempts++ + + if ($job.State -eq 'Completed') { break } + + try { + # NO -Keep here: immediate polling is just waiting for [SENT], job hasn't completed yet + # Result objects only appear after job completion, which happens later in main monitoring loop + $immediateOutput = Receive-Job -Job $job -ErrorAction SilentlyContinue + if ($immediateOutput) { + foreach ($output in $immediateOutput) { + if ($output -is [string]) { + $msgKey = "$($job.Id):$output" + if ($output -match '^\[ATTEMPT\]') { + $msgKey = "$($job.Id):ATTEMPT" + } + + if ($output -match '^\[ATTEMPT\]') { + # Always show ATTEMPT messages (retries) + if (-not $script:shownJobMessages.ContainsKey($msgKey)) { + Write-LogHost $output -ForegroundColor DarkGray + $script:shownJobMessages[$msgKey] = $true + } + } + elseif ($output -match '^\[SENT\]') { + # Only show SENT once globally + $msgKey = "$($job.Id):$output" + if (-not $script:shownJobMessages.ContainsKey($msgKey)) { + Write-LogHost $output -ForegroundColor DarkGray + $script:shownJobMessages[$msgKey] = $true + + # CRITICAL: Extract and store QueryId from SENT message for retry reuse + # Format: [SENT] [HH:mm:ss] Partition X/Y - Query sent to Purview (QueryId: xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx) + if ($output -match 'QueryId:\s*([a-f0-9-]+)\)') { + $extractedQueryId = $matches[1] + $jobPartition = $jobMeta[$job.Id] + if ($jobPartition -and $script:partitionStatus.ContainsKey($jobPartition.Index)) { + $script:partitionStatus[$jobPartition.Index].QueryId = $extractedQueryId + Write-Verbose "[QUERYID-CAPTURED] Partition $($jobPartition.Index) QueryId=$extractedQueryId" -Verbose:$VerbosePreference + + # CHECKPOINT: Save QueryCreated state so we can resume data fetch 
if interrupted + if ($script:CheckpointEnabled) { + Save-Checkpoint -PartitionIndex $jobPartition.Index -QueryId $extractedQueryId -State 'QueryCreated' + } + } + } + } else { + Write-Verbose "DEDUP: Already shown - JobId=$($job.Id), Msg=$output" -Verbose:$VerbosePreference + } + $gotSentMessage = $true + } + elseif ($output -match '^\[ERROR\]') { + if (-not $script:shownJobMessages.ContainsKey($msgKey)) { + Write-LogHost $output -ForegroundColor Red + $script:shownJobMessages[$msgKey] = $true + # Check for 401 in error messages + if ($output -match '401|Unauthorized') { + $script:AuthFailureDetected = $true + if (-not $script:Auth401MessageShown) { + $script:Auth401MessageShown = $true + Write-LogHost " [AUTH] 401 Unauthorized detected in job output" -ForegroundColor Red + } + # CRITICAL: Mark this partition as Failed for retry after re-auth + $jobPartition = $jobMeta[$job.Id] + if ($jobPartition -and $script:partitionStatus.ContainsKey($jobPartition.Index)) { + $script:partitionStatus[$jobPartition.Index].Status = 'Failed' + $script:partitionStatus[$jobPartition.Index].LastError = '401 Unauthorized - token expired' + } + break # IMMEDIATE EXIT + } + } + $gotSentMessage = $true # Stop polling on error too + } + elseif ($output -match '^\[403-(CREATE|POLL|FETCH)\]') { + # Transient 403 retry messages - dedupe by partition+attempt + $msgKey = "$($job.Id):403:$output" + if (-not $script:shownJobMessages.ContainsKey($msgKey)) { + Write-LogHost $output -ForegroundColor Magenta + $script:shownJobMessages[$msgKey] = $true + } + } + elseif ($output -match '^\[403-(PERM|MAX)\]') { + # Permanent/max retry 403 messages + $msgKey = "$($job.Id):403:$output" + if (-not $script:shownJobMessages.ContainsKey($msgKey)) { + Write-LogHost $output -ForegroundColor Red + $script:shownJobMessages[$msgKey] = $true + } + } + elseif ($output -match '^\[STATUS\] Query running') { + $msgKey = "$($job.Id):STATUS" + if (-not $script:shownJobMessages.ContainsKey($msgKey)) { + Write-LogHost 
($output -replace '^\[STATUS\]\s*','') -ForegroundColor Yellow + $script:shownJobMessages[$msgKey] = $true + } + } + elseif ($output -match '^\[SUCCESS\]') { + $msgKey = "$($job.Id):SUCCESS" + if (-not $script:shownJobMessages.ContainsKey($msgKey)) { + Write-LogHost ($output -replace '^\[SUCCESS\]\s*','') -ForegroundColor Green + $script:shownJobMessages[$msgKey] = $true + } + } + } + elseif ($output -isnot [string] -and -not $script:processedJobIds.Contains($job.Id)) { + # FULL RESULT PROCESSING: Handle result objects immediately to ensure JSONL save + $jobPartition = $jobMeta[$job.Id] + if ($jobPartition -and $script:partitionStatus.ContainsKey($jobPartition.Index)) { + $currentStatus = $script:partitionStatus[$jobPartition.Index].Status + if ($currentStatus -ne 'Complete') { + if ([string]::IsNullOrWhiteSpace([string]$output.QueryId) -and [int]($output.RetrievedCount ?? 0) -le 0) { + $script:partitionStatus[$jobPartition.Index].Status = 'Failed' + $script:partitionStatus[$jobPartition.Index].LastError = 'ThreadJob returned empty QueryId with zero records (query was not created/sent)' + Write-LogHost " [DATA-CHECK] Partition $($jobPartition.Index) returned empty QueryId with 0 records - marked Failed for retry" -ForegroundColor Yellow + [void]$script:processedJobIds.Add($job.Id) + continue + } + + $script:partitionStatus[$jobPartition.Index].Status = 'Complete' + $script:partitionStatus[$jobPartition.Index].QueryId = $output.QueryId + $script:partitionStatus[$jobPartition.Index].RecordCount = $output.RetrievedCount + + # Add logs to collection (skip if memory flush enabled - using JSONL only) + if ($output.Logs -and $output.Logs.Count -gt 0 -and -not $script:memoryFlushEnabled) { + $allLogs.AddRange($output.Logs) + } + + # Update metrics + try { + $script:metrics.QueryMs += [int]$output.ElapsedMs + $script:metrics.TotalRecordsFetched += [int]$output.RetrievedCount + } catch {} + + # Save checkpoint + if ($script:CheckpointEnabled) { + Save-Checkpoint 
-PartitionIndex $jobPartition.Index -QueryId $output.QueryId -State 'Completed' -RecordCount $output.RetrievedCount + } + + # INCREMENTAL SAVE: Write JSONL immediately + if ($output.Logs -and $output.Logs.Count -gt 0) { + try { + $incrementalDir = Join-Path (Split-Path $script:PartialOutputPath -Parent) ".pax_incremental" + if (-not (Test-Path $incrementalDir)) { + New-Item -ItemType Directory -Path $incrementalDir -Force | Out-Null + } + $incrementalFile = Join-Path $incrementalDir "Part$($jobPartition.Index)_${global:ScriptRunTimestamp}_qid-$($output.QueryId)_$($output.RetrievedCount)records.jsonl" + $output.Logs | ForEach-Object { $_ | ConvertTo-Json -Depth 10 -Compress } | Out-File -FilePath $incrementalFile -Encoding utf8 -Force + Write-LogHost " [SAVE] Partition $($jobPartition.Index): $($output.RetrievedCount) records saved to disk" -ForegroundColor DarkGreen + + # Clear reference to allow GC (critical for memory management) + $output.Logs = $null + + # MEMORY MANAGEMENT: Mark if we're using JSONL-only mode + if ($script:memoryFlushEnabled -and -not $script:memoryFlushed) { + $script:memoryFlushed = $true + Write-LogHost " [MEMORY] Memory management active - data written to JSONL only (streaming export at end)" -ForegroundColor Yellow + } + } catch { + Write-LogHost " [WARN] Failed to save incremental data for Partition $($jobPartition.Index): $($_.Exception.Message)" -ForegroundColor Yellow + } + } + + # Mark job as processed + [void]$script:processedJobIds.Add($job.Id) + } + } + } + } + } + } catch {} + } + + Write-Verbose "Created job for partition $($pt.Index)/$($pt.Total) - Job ID: $($job.Id)" -Verbose:$VerbosePreference + + # Start monitoring loop in background once first batch is queued + # Note: Jobs may still be retrying 403s internally - this just means they've been started + if (-not $monitoringStarted -and $jobs.Count -ge $firstBatchSize) { + $monitoringStarted = $true + # Initialize monitoring state + $script:lastStatusUpdate = Get-Date + } # 
Show status updates while creating jobs (if monitoring started) + if ($monitoringStarted) { + # Collect output from all existing jobs + # FIX: Process ALL jobs including completed ones to ensure JSONL gets saved + foreach ($existingJob in $jobs) { + # REMOVED: if ($existingJob.State -eq 'Completed') { continue } + # Completed jobs MUST be processed to get their result objects for JSONL save + try { + # NO -Keep: process result objects fully here (JSONL save, checkpoint) + $jobOutput = Receive-Job -Job $existingJob -ErrorAction SilentlyContinue + if ($jobOutput) { + foreach ($output in $jobOutput) { + if ($output -is [string]) { + $msgKey = "$($existingJob.Id):$output" + if ($output -match '^\[ATTEMPT\]') { + $msgKey = "$($existingJob.Id):ATTEMPT" + } + + if ($output -match '^\[ATTEMPT\]') { + # Always show ATTEMPT messages (retries) + if (-not $script:shownJobMessages.ContainsKey($msgKey)) { + Write-LogHost $output -ForegroundColor DarkGray + $script:shownJobMessages[$msgKey] = $true + } + } + elseif ($output -match '^\[SENT\]') { + # Only show SENT once per job + if (-not $script:shownJobMessages.ContainsKey($msgKey)) { + Write-LogHost $output -ForegroundColor DarkGray + $script:shownJobMessages[$msgKey] = $true + } + # CRITICAL: Extract and store QueryId from SENT message for retry reuse + if ($output -match 'QueryId:\s*([a-f0-9-]+)\)') { + $extractedQueryId = $matches[1] + $jobPartition = $jobMeta[$existingJob.Id] + if ($jobPartition -and $script:partitionStatus.ContainsKey($jobPartition.Index)) { + $script:partitionStatus[$jobPartition.Index].QueryId = $extractedQueryId + } + } + } + elseif ($output -match '^\[ERROR\]') { + if (-not $script:shownJobMessages.ContainsKey($msgKey)) { + Write-LogHost $output -ForegroundColor Red + $script:shownJobMessages[$msgKey] = $true + # Check for 401 in error messages + if ($output -match '401|Unauthorized') { + $script:AuthFailureDetected = $true + if (-not $script:Auth401MessageShown) { + $script:Auth401MessageShown = $true 
+ Write-LogHost " [AUTH] 401 Unauthorized detected in job output" -ForegroundColor Red + } + # CRITICAL: Mark this partition as Failed for retry after re-auth + $jobPartition = $jobMeta[$existingJob.Id] + if ($jobPartition -and $script:partitionStatus.ContainsKey($jobPartition.Index)) { + $script:partitionStatus[$jobPartition.Index].Status = 'Failed' + $script:partitionStatus[$jobPartition.Index].LastError = '401 Unauthorized - token expired' + } + break # IMMEDIATE EXIT + } + } + } + elseif ($output -match '^\[STATUS\] Query running') { + $msgKey = "$($existingJob.Id):STATUS" + if (-not $script:shownJobMessages.ContainsKey($msgKey)) { + Write-LogHost ($output -replace '^\[STATUS\]\s*','') -ForegroundColor Yellow + $script:shownJobMessages[$msgKey] = $true + } + } + elseif ($output -match '^\[PROGRESS\]') { + # Silent: extract page count to update STATUS display, do not log + if ($output -match '\[PROGRESS\] P(\d+)/\d+ pg(\d+)') { + $pIdx = [int]$matches[1]; $pg = [int]$matches[2] + if (-not $script:partitionPageCounts.ContainsKey($pIdx) -or $pg -gt $script:partitionPageCounts[$pIdx]) { + $script:partitionPageCounts[$pIdx] = $pg + } + } + $script:shownJobMessages[$msgKey] = $true + } + elseif ($output -match '^\[NETWORK\]') { + # Also extract page count from NETWORK error messages + if ($output -match 'Partition (\d+)/\d+ Page (\d+)') { + $pIdx = [int]$matches[1]; $pg = [int]$matches[2] + if (-not $script:partitionPageCounts.ContainsKey($pIdx) -or $pg -gt $script:partitionPageCounts[$pIdx]) { + $script:partitionPageCounts[$pIdx] = $pg + } + } + if (-not $script:shownJobMessages.ContainsKey($msgKey)) { + Write-LogHost $output -ForegroundColor DarkYellow + $script:shownJobMessages[$msgKey] = $true + } + } + elseif ($output -match '^\[SUCCESS\]') { + $msgKey = "$($existingJob.Id):SUCCESS" + if (-not $script:shownJobMessages.ContainsKey($msgKey)) { + Write-LogHost ($output -replace '^\[SUCCESS\]\s*','') -ForegroundColor Green + $script:shownJobMessages[$msgKey] = $true 
+ } + } + } + elseif ($output -isnot [string] -and -not $script:processedJobIds.Contains($existingJob.Id)) { + # DEFENSIVE: Detect SplitRequired objects and treat as Failed for retry + if ($output.SplitRequired -eq $true) { + $jobPartition = $jobMeta[$existingJob.Id] + if ($jobPartition -and $script:partitionStatus.ContainsKey($jobPartition.Index)) { + $script:partitionStatus[$jobPartition.Index].Status = 'Failed' + $script:partitionStatus[$jobPartition.Index].LastError = "Query still running after $([Math]::Round($output.ElapsedMinutes,1)) min - queued for retry" + Write-LogHost " [RETRY-QUEUE] Partition $($jobPartition.Index)/$($jobPartition.Total) - query still processing after $([Math]::Round($output.ElapsedMinutes,1)) min, queued for retry" -ForegroundColor Yellow + } + [void]$script:processedJobIds.Add($existingJob.Id) + continue + } + # FULL RESULT PROCESSING: Handle result objects immediately to ensure JSONL save + $jobPartition = $jobMeta[$existingJob.Id] + if ($jobPartition -and $script:partitionStatus.ContainsKey($jobPartition.Index)) { + $currentStatus = $script:partitionStatus[$jobPartition.Index].Status + if ($currentStatus -ne 'Complete') { + if ([string]::IsNullOrWhiteSpace([string]$output.QueryId) -and [int]($output.RetrievedCount ?? 0) -le 0) { + $script:partitionStatus[$jobPartition.Index].Status = 'Failed' + $script:partitionStatus[$jobPartition.Index].LastError = 'ThreadJob returned empty QueryId with zero records (query was not created/sent)' + Write-LogHost " [DATA-CHECK] Partition $($jobPartition.Index) returned empty QueryId with 0 records - marked Failed for retry" -ForegroundColor Yellow + [void]$script:processedJobIds.Add($existingJob.Id) + continue + } + + $script:partitionStatus[$jobPartition.Index].Status = 'Complete' + $script:partitionStatus[$jobPartition.Index].QueryId = $output.QueryId + $script:partitionStatus[$jobPartition.Index].RecordCount = ($output.RetrievedCount ?? 
0) + + # Fallback: emit SUCCESS message if the original [SUCCESS] string was already consumed + $successKey = "$($existingJob.Id):SUCCESS" + if (-not $script:shownJobMessages.ContainsKey($successKey)) { + $script:shownJobMessages[$successKey] = $true + Write-LogHost "Query succeeded - Partition $($jobPartition.Index)/$($jobPartition.Total) - Query ID: $($output.QueryId) - Retrieved $($output.RetrievedCount ?? 0) records" -ForegroundColor Green + } + + if ($output.ThreadSavedToDisk -and $script:memoryFlushEnabled -and -not $script:memoryFlushed) { + $script:memoryFlushed = $true + Write-LogHost " [MEMORY] Thread-side persistence active - streaming export will use JSONL files" -ForegroundColor Yellow + } + + # Add logs to collection (skip if memory flush enabled - using JSONL only) + if ($output.Logs -and $output.Logs.Count -gt 0 -and -not $script:memoryFlushEnabled) { + $allLogs.AddRange($output.Logs) + } + + # Update metrics + try { + $script:metrics.QueryMs += [int]$output.ElapsedMs + $script:metrics.TotalRecordsFetched += [int]$output.RetrievedCount + } catch {} + + # Save checkpoint + if ($script:CheckpointEnabled) { + Save-Checkpoint -PartitionIndex $jobPartition.Index -QueryId $output.QueryId -State 'Completed' -RecordCount $output.RetrievedCount + } + + # INCREMENTAL SAVE: Write JSONL immediately + if ($output.Logs -and $output.Logs.Count -gt 0) { + try { + $incrementalDir = Join-Path (Split-Path $script:PartialOutputPath -Parent) ".pax_incremental" + if (-not (Test-Path $incrementalDir)) { + New-Item -ItemType Directory -Path $incrementalDir -Force | Out-Null + } + $incrementalFile = Join-Path $incrementalDir "Part$($jobPartition.Index)_${global:ScriptRunTimestamp}_qid-$($output.QueryId)_$($output.RetrievedCount)records.jsonl" + $output.Logs | ForEach-Object { $_ | ConvertTo-Json -Depth 10 -Compress } | Out-File -FilePath $incrementalFile -Encoding utf8 -Force + Write-LogHost " [SAVE] Partition $($jobPartition.Index): $($output.RetrievedCount) records saved 
to disk" -ForegroundColor DarkGreen + + # Clear reference to allow GC (critical for memory management) + $output.Logs = $null + + # MEMORY MANAGEMENT: Mark if we're using JSONL-only mode + if ($script:memoryFlushEnabled -and -not $script:memoryFlushed) { + $script:memoryFlushed = $true + Write-LogHost " [MEMORY] Memory management active - data written to JSONL only (streaming export at end)" -ForegroundColor Yellow + } + } catch { + Write-LogHost " [WARN] Failed to save incremental data for Partition $($jobPartition.Index): $($_.Exception.Message)" -ForegroundColor Yellow + } + } + + # Mark job as processed + [void]$script:processedJobIds.Add($existingJob.Id) + } + } + } + } + } + } catch { + # Silently continue if we can't receive from a job + } + } + + $elapsedSinceLastUpdate = ((Get-Date) - $script:lastStatusUpdate).TotalSeconds + if ($elapsedSinceLastUpdate -ge $StatusIntervalSeconds) { + # Count partitions by status (EXCLUDE 'Subdivided' parent partitions from total) + $activeStatuses = $script:partitionStatus.Values | Where-Object { $_.Status -ne 'Subdivided' } + $completedPartitions = @($activeStatuses | Where-Object { $_.Status -eq 'Complete' }).Count + $jobCreatedPartitions = @($activeStatuses | Where-Object { $_.Status -eq 'JobCreated' }).Count + $notStartedPartitions = @($activeStatuses | Where-Object { $_.Status -eq 'NotStarted' }).Count + + # Calculate remaining and total (only active partitions) + $remainingToComplete = $jobCreatedPartitions + $notStartedPartitions + $totalPartitions = $activeStatuses.Count + + $statusLine = "[$(Get-Date -Format 'HH:mm:ss')] Total Queries: $($totalPartitions + $script:resumeCompletedOffset) | Completed: $($completedPartitions + $script:resumeCompletedOffset) | Remaining: $remainingToComplete" + if ($script:partitionPageCounts -and $script:partitionPageCounts.Count -gt 0) { + $pageStr = ($script:partitionPageCounts.GetEnumerator() | Sort-Object Key | ForEach-Object { "P$($_.Key):$($_.Value.ToString('N0'))pg" }) -join ' 
' + $statusLine += " | Pages $pageStr" + } + Write-LogHost $statusLine -ForegroundColor White + + $script:lastStatusUpdate = Get-Date + } + } # Staggered launch with 10-25s jitter to prevent API burst (except for last job) + if ($pt.Index -lt $pt.Total) { + $staggerDelay = Get-Random -Minimum 10 -Maximum 25 + Start-Sleep -Seconds $staggerDelay + } + } + + # All ThreadJobs now launched (note: this only means threads are running, not that queries were created on server) + $launchElapsed = [Math]::Round(((Get-Date) - $launchStartTime).TotalSeconds, 1) + Write-LogHost " All $($jobs.Count) ThreadJobs launched (${launchElapsed}s), monitoring query creation..." -ForegroundColor DarkCyan + + + # If monitoring wasn't started earlier (only happens if $partitions.Count < $firstBatchSize) + if (-not $monitoringStarted) { + $script:throttleNotifications = [System.Collections.Concurrent.ConcurrentQueue[PSObject]]::new() + } # Wait for all jobs to complete and display monitoring output + $initialBlockSize = if ($script:globalLearnedBlockSize -and $script:globalLearnedBlockSize -gt 0) { $script:globalLearnedBlockSize } else { $BlockHours } + if ($initialBlockSize -le 0) { $initialBlockSize = 0.5 } + + # Simple polling to monitor job completion + $lastStatusUpdate = Get-Date + $firstStatus = $true + + while (($jobs | Where-Object { $_.State -in 'Running','NotStarted' }).Count -gt 0) { + # Continuously collect output from all jobs (running and completed) + foreach ($job in $jobs) { + # Skip jobs we've already processed (Receive-Job can only be called once) + if ($script:processedJobIds.Contains($job.Id)) { + continue + } + + try { + $jobOutput = Receive-Job -Job $job -Keep -ErrorAction SilentlyContinue -ErrorVariable jobErrors + + # SCALING FIX: Receive-Job -Keep re-delivers ALL historical output every loop iteration. 
+ # After hundreds/thousands of pages, iterating all messages on every 500ms cycle causes + # progressive slowdown (STATUS gaps widen from 60s to 10+ minutes after many hours). + # Track per-job offset so we only process truly new messages each cycle. + $rawCount = if ($jobOutput) { @($jobOutput).Count } else { 0 } + $seenCount = if ($script:jobOutputOffset.ContainsKey($job.Id)) { $script:jobOutputOffset[$job.Id] } else { 0 } + $script:jobOutputOffset[$job.Id] = $rawCount + $jobOutput = if ($rawCount -gt $seenCount) { @($jobOutput)[$seenCount..($rawCount - 1)] } else { $null } + + # Capture and log job errors to file (only once per job) + if ($jobErrors -and $jobErrors.Count -gt 0) { + # Check if we've already logged errors for this job + $errorLogKey = "ERRORS:$($job.Id)" + if (-not $script:shownJobMessages.ContainsKey($errorLogKey)) { + $script:shownJobMessages[$errorLogKey] = $true + foreach ($err in $jobErrors) { + $errMsg = if ($err.Exception) { $err.Exception.Message } else { $err.ToString() } + Write-Log "Job $($job.Id) error: $errMsg" -Level "ERROR" + + # DIFFERENTIATE 401 vs 403 ERRORS + # 401 = Token expired/invalid → refresh will help + # 403 = Permission denied → refresh will NOT help + if ($errMsg -match '403|Forbidden|Access.*denied|Insufficient.*privileges') { + # 403 Forbidden - permissions issue, NOT token expiration + # Don't set AuthFailureDetected - refresh won't help + Write-LogHost " [AUTH] 403 Forbidden detected - this is a PERMISSIONS issue, not token expiration" -ForegroundColor Red + Write-LogHost " [AUTH] Token refresh will NOT resolve this. 
Check:" -ForegroundColor Yellow + Write-LogHost " • AuditLog.Read.All scope is granted" -ForegroundColor Yellow + Write-LogHost " • Admin consent has been provided" -ForegroundColor Yellow + Write-LogHost " • Required Azure AD role is assigned" -ForegroundColor Yellow + } + elseif ($errMsg -match '401|Unauthorized|token.*expired|authentication.*failed') { + # 401 Unauthorized - token issue, refresh will help + $script:AuthFailureDetected = $true + if (-not $script:Auth401MessageShown) { + $script:Auth401MessageShown = $true + Write-LogHost " [AUTH] 401 Unauthorized detected - stopping to re-authenticate" -ForegroundColor Red + } + break # IMMEDIATE EXIT from job error processing + } + } + + # Mark partition as Failed for retry when errors detected + $pt = $jobMeta[$job.Id] + if ($pt -and $script:partitionStatus.ContainsKey($pt.Index)) { + $currentStatus = $script:partitionStatus[$pt.Index].Status + if ($currentStatus -notin 'Complete', 'Failed', 'Subdivided') { + $script:partitionStatus[$pt.Index].Status = 'Failed' + $script:partitionStatus[$pt.Index].LastError = $jobErrors[0].Exception.Message + Write-LogHost " [RETRY-QUEUE] Partition $($pt.Index)/$($pt.Total) marked as Failed for retry (from monitoring loop)" -ForegroundColor Yellow + } + } + } + } + + if ($jobOutput) { + foreach ($output in $jobOutput) { + if ($output -is [string]) { + # Create unique key for deduplication + $msgKey = "$($job.Id):$output" + if ($output -match '^\[ATTEMPT\]') { + $msgKey = "$($job.Id):ATTEMPT" + } + elseif ($output -match '^\[STATUS\] Query running') { + $msgKey = "$($job.Id):STATUS" + } + elseif ($output -match '^\[SUCCESS\]') { + $msgKey = "$($job.Id):SUCCESS" + } + elseif ($output -match '^\[403-CREATE\]|^\[403-FETCH\]') { + # Use the full output as key to deduplicate identical 403 messages + # This prevents "Attempt 2/3" from repeating but allows different attempts to show + $msgKey = "$($job.Id):$output" + } + elseif ($output -match '^\[PROGRESS\]') { + # PROGRESS messages 
are silent page-count heartbeats — use full output as key (each is unique) + $msgKey = "$($job.Id):$output" + } + elseif ($output -match '^\[NETWORK\]') { + # NETWORK messages include changing elapsed time - deduplicate by partition/page only + # Extract partition and page info for stable key + if ($output -match 'Partition (\d+/\d+).*Page (\d+)') { + $msgKey = "$($job.Id):NETWORK:$($matches[1]):Page$($matches[2])" + } elseif ($output -match 'Partition (\d+/\d+)') { + $msgKey = "$($job.Id):NETWORK:$($matches[1])" + } else { + $msgKey = "$($job.Id):NETWORK" + } + } + if (-not $script:shownJobMessages.ContainsKey($msgKey)) { + $script:shownJobMessages[$msgKey] = $true + + # Display messages with appropriate colors + if ($output -match '^\[ATTEMPT\]') { + Write-LogHost $output -ForegroundColor Cyan + } + elseif ($output -match '^\[SENT\]') { + Write-LogHost $output -ForegroundColor DarkGray + # CRITICAL: Extract and store QueryId from SENT message for retry reuse + if ($output -match 'QueryId:\s*([a-f0-9-]+)\)') { + $extractedQueryId = $matches[1] + $pt = $jobMeta[$job.Id] + if ($pt -and $script:partitionStatus.ContainsKey($pt.Index)) { + $script:partitionStatus[$pt.Index].QueryId = $extractedQueryId + } + } + } + elseif ($output -match '^\[ERROR\]') { + Write-LogHost $output -ForegroundColor Red + # Check for 401 in error messages + if ($output -match '401|Unauthorized') { + $script:AuthFailureDetected = $true + if (-not $script:Auth401MessageShown) { + $script:Auth401MessageShown = $true + Write-LogHost " [AUTH] 401 Unauthorized detected in job output" -ForegroundColor Red + } + # CRITICAL: Mark this partition as Failed for retry after re-auth + $pt = $jobMeta[$job.Id] + if ($pt -and $script:partitionStatus.ContainsKey($pt.Index)) { + $script:partitionStatus[$pt.Index].Status = 'Failed' + $script:partitionStatus[$pt.Index].LastError = '401 Unauthorized - token expired' + } + break # IMMEDIATE EXIT + } + } + elseif ($output -match '^\[403-CREATE\]|^\[403-FETCH\]') { 
+ Write-LogHost $output -ForegroundColor Yellow + } + elseif ($output -match '^\[PROGRESS\]') { + # Silent: extract page count, do not display + if ($output -match '\[PROGRESS\] P(\d+)/\d+ pg(\d+)') { + $pIdx = [int]$matches[1]; $pg = [int]$matches[2] + if (-not $script:partitionPageCounts.ContainsKey($pIdx) -or $pg -gt $script:partitionPageCounts[$pIdx]) { + $script:partitionPageCounts[$pIdx] = $pg + } + } + } + elseif ($output -match '^\[NETWORK\]') { + # Also extract page count from NETWORK error messages + if ($output -match 'Partition (\d+)/\d+ Page (\d+)') { + $pIdx = [int]$matches[1]; $pg = [int]$matches[2] + if (-not $script:partitionPageCounts.ContainsKey($pIdx) -or $pg -gt $script:partitionPageCounts[$pIdx]) { + $script:partitionPageCounts[$pIdx] = $pg + } + } + Write-LogHost $output -ForegroundColor Yellow + } + elseif ($output -match '^\[STATUS\] Query running') { + Write-LogHost ($output -replace '^\[STATUS\]\s*','') -ForegroundColor Yellow + } + elseif ($output -match '^\[SUCCESS\]') { + Write-LogHost ($output -replace '^\[SUCCESS\]\s*','') -ForegroundColor Green + } + } + } + elseif ($output -isnot [string]) { + # DEFENSIVE: Detect SplitRequired objects and treat as Failed for retry + if ($output.SplitRequired -eq $true) { + $pt = $jobMeta[$job.Id] + if ($pt -and $script:partitionStatus.ContainsKey($pt.Index)) { + $script:partitionStatus[$pt.Index].Status = 'Failed' + $script:partitionStatus[$pt.Index].LastError = "Query still running after $([Math]::Round($output.ElapsedMinutes,1)) min - queued for retry" + Write-LogHost " [RETRY-QUEUE] Partition $($pt.Index)/$($pt.Total) - query still processing after $([Math]::Round($output.ElapsedMinutes,1)) min, queued for retry" -ForegroundColor Yellow + } + [void]$script:processedJobIds.Add($job.Id) + continue + } + # This is a result object - mark partition as Complete and collect logs immediately + $pt = $jobMeta[$job.Id] + if ($pt) { + + if ($script:partitionStatus.ContainsKey($pt.Index)) { + $currentStatus 
= $script:partitionStatus[$pt.Index].Status + + if ($currentStatus -ne 'Complete') { + if ([string]::IsNullOrWhiteSpace([string]$output.QueryId) -and [int]($output.RetrievedCount ?? 0) -le 0) { + $script:partitionStatus[$pt.Index].Status = 'Failed' + $script:partitionStatus[$pt.Index].LastError = 'ThreadJob returned empty QueryId with zero records (query was not created/sent)' + Write-LogHost " [DATA-CHECK] Partition $($pt.Index) returned empty QueryId with 0 records - marked Failed for retry" -ForegroundColor Yellow + [void]$script:processedJobIds.Add($job.Id) + continue + } + + $script:partitionStatus[$pt.Index].Status = 'Complete' + $script:partitionStatus[$pt.Index].QueryId = $output.QueryId + $script:partitionStatus[$pt.Index].RecordCount = ($output.RetrievedCount ?? 0) + + # Fallback: emit SUCCESS message if the original [SUCCESS] string was already consumed + $successKey = "$($job.Id):SUCCESS" + if (-not $script:shownJobMessages.ContainsKey($successKey)) { + $script:shownJobMessages[$successKey] = $true + Write-LogHost "Query succeeded - Partition $($pt.Index)/$($pt.Total) - Query ID: $($output.QueryId) - Retrieved $($output.RetrievedCount ?? 
0) records" -ForegroundColor Green + } + + # Activate streaming export if thread persisted data to disk (thread-side save path: Logs returned empty) + if ($output.ThreadSavedToDisk -and $script:memoryFlushEnabled -and -not $script:memoryFlushed) { + $script:memoryFlushed = $true + Write-LogHost " [MEMORY] Thread-side persistence active - streaming export will use JSONL files" -ForegroundColor Yellow + } + + # Add logs to collection (skip if memory flush enabled - using JSONL only) + # Receive-Job can only be called once, so we must collect now + if ($output.Logs -and $output.Logs.Count -gt 0 -and -not $script:memoryFlushEnabled) { + $allLogs.AddRange($output.Logs) + } + + # Update aggregate metrics only (per-record activity breakdown happens in explosion phase) + # CRITICAL: Do NOT iterate through logs here - it blocks the monitoring loop + try { + $script:metrics.QueryMs += [int]$output.ElapsedMs + $script:metrics.TotalRecordsFetched += [int]$output.RetrievedCount + } catch {} + + # Mark partition complete in checkpoint (so Ctrl+C shows accurate count) + if ($script:CheckpointEnabled) { + Save-Checkpoint -PartitionIndex $pt.Index -QueryId $output.QueryId -State 'Completed' -RecordCount $output.RetrievedCount + } + + # INCREMENTAL SAVE: Write partition records to disk immediately (prevents data loss on auth failure) + if ($output.Logs -and $output.Logs.Count -gt 0) { + try { + $incrementalDir = Join-Path (Split-Path $script:PartialOutputPath -Parent) ".pax_incremental" + if (-not (Test-Path $incrementalDir)) { + New-Item -ItemType Directory -Path $incrementalDir -Force | Out-Null + } + $incrementalFile = Join-Path $incrementalDir "Part$($pt.Index)_${global:ScriptRunTimestamp}_qid-$($output.QueryId)_$($output.RetrievedCount)records.jsonl" + # Write as JSON Lines (NDJSON) - one record per line for recoverability + $output.Logs | ForEach-Object { $_ | ConvertTo-Json -Depth 10 -Compress } | Out-File -FilePath $incrementalFile -Encoding utf8 -Force + Write-LogHost " 
[SAVE] Partition $($pt.Index): $($output.RetrievedCount) records saved to disk" -ForegroundColor DarkGreen + + # Clear reference to allow GC (critical for memory management) + $output.Logs = $null + + # MEMORY MANAGEMENT: Mark if we're using JSONL-only mode + if ($script:memoryFlushEnabled -and -not $script:memoryFlushed) { + $script:memoryFlushed = $true + Write-LogHost " [MEMORY] Memory management active - data written to JSONL only (streaming export at end)" -ForegroundColor Yellow + } + } catch { + Write-LogHost " [WARN] Failed to save incremental data for Partition $($pt.Index): $($_.Exception.Message)" -ForegroundColor Yellow + } + } + + # Mark job as processed since we've already collected its output + [void]$script:processedJobIds.Add($job.Id) + } else { + } + } else { + } + } else { + } + } + } + } + } catch { + # Silently continue if we can't receive from a job + } + } + + # Check if 60 seconds have passed since last status update (or first iteration) + $elapsedSinceLastUpdate = ((Get-Date) - $lastStatusUpdate).TotalSeconds + if ($firstStatus -or $elapsedSinceLastUpdate -ge $StatusIntervalSeconds) { + # Count partitions by status (EXCLUDE 'Subdivided' parent partitions) + $activeStatuses = @($script:partitionStatus.Values) | Where-Object { $_.Status -ne 'Subdivided' } + $completedPartitions = @($activeStatuses | Where-Object { $_.Status -eq 'Complete' }).Count + $jobCreatedPartitions = @($activeStatuses | Where-Object { $_.Status -eq 'JobCreated' }).Count + $notStartedPartitions = @($activeStatuses | Where-Object { $_.Status -eq 'NotStarted' }).Count + $failedCount = @($activeStatuses | Where-Object { $_.Status -eq 'Failed' }).Count + + # Calculate remaining and total (only active partitions) + $remainingToComplete = $jobCreatedPartitions + $notStartedPartitions + $totalPartitions = @($activeStatuses).Count + + $statusLine = "[$(Get-Date -Format 'HH:mm:ss')] Total Queries: $($totalPartitions + $script:resumeCompletedOffset) | Completed: 
$($completedPartitions + $script:resumeCompletedOffset) | Remaining: $remainingToComplete" + if ($failedCount -gt 0) { + $statusLine += " | Failed: $failedCount" + } + if ($script:partitionPageCounts -and $script:partitionPageCounts.Count -gt 0) { + $pageStr = ($script:partitionPageCounts.GetEnumerator() | Sort-Object Key | ForEach-Object { "P$($_.Key):$($_.Value.ToString('N0'))pg" }) -join ' ' + $statusLine += " | Pages $pageStr" + } + + Write-LogHost $statusLine -ForegroundColor White + + $lastStatusUpdate = Get-Date + $firstStatus = $false + } + + # PROACTIVE TOKEN REFRESH (ALL auth modes): Refresh before expiration + # Uses SharedAuthState.ExpiresOn to determine when refresh is needed + # This prevents 401 errors during long-running job monitoring + # Thread jobs read from SharedAuthState, so they get the fresh token automatically + $refreshResult = Refresh-GraphTokenIfNeeded -BufferMinutes 5 + # CRITICAL: Use -is [string] check to avoid PowerShell coercion bug where $true -eq 'Quit' returns True + if ($refreshResult -is [string] -and $refreshResult -eq 'Quit') { + # User chose to quit at auth prompt - save checkpoint and exit gracefully + if ($script:CheckpointEnabled) { Save-Checkpoint -Force } + Show-CheckpointExitMessage + exit 0 + } + + # REACTIVE AUTH CHECK: If 401 detected, handle token refresh + # Use AuthPromptInProgress as debounce to prevent multiple simultaneous auth prompts + if ($script:AuthFailureDetected -and -not $script:AuthPromptInProgress) { + $script:AuthPromptInProgress = $true # Set debounce flag + Write-LogHost "" + Write-LogHost " [AUTH] Authentication failure detected - pausing job monitoring" -ForegroundColor Red + + # AppRegistration mode: Automatically refresh token (silent, no prompt) + # Interactive modes: Prompt user for re-authentication + if ($script:AuthConfig.Method -eq 'AppRegistration' -and $script:AuthConfig.CanReauthenticate) { + Write-LogHost " [AUTH] Attempting automatic token refresh for AppRegistration..." 
-ForegroundColor Yellow + $refreshResult = Invoke-TokenRefresh -Force + + if (-not $refreshResult.Success -or -not $refreshResult.NewToken) { + Write-LogHost " [AUTH] Automatic token refresh failed: $($refreshResult.Message)" -ForegroundColor Red + Write-LogHost " [AUTH] Cannot continue without valid authentication." -ForegroundColor Red + if ($script:CheckpointEnabled) { + Save-Checkpoint -Force + } + Write-LogHost " Exiting due to authentication failure. Use -Resume to continue later." -ForegroundColor Yellow + return + } + + # CRITICAL: Update $accessToken with fresh token for retry phase + $accessToken = $refreshResult.NewToken + # CRITICAL: Update shared auth state so thread jobs see the fresh token + $script:SharedAuthState.Token = $refreshResult.NewToken + $script:SharedAuthState.ExpiresOn = (Get-Date).ToUniversalTime().AddMinutes(50) + $script:SharedAuthState.LastRefresh = Get-Date + Write-LogHost " [AUTH] Token refreshed successfully" -ForegroundColor Green + $script:AuthFailureDetected = $false + $script:Auth401MessageShown = $false # Reset for next auth failure cycle + $script:AuthPromptInProgress = $false # Reset debounce flag + } else { + # Interactive mode - prompt user + $reauthResult = Invoke-TokenRefreshPrompt + + if ($reauthResult -eq 'Quit') { + # User chose to quit - save checkpoint and exit gracefully + if ($script:CheckpointEnabled) { + Save-Checkpoint -Force + } + Write-LogHost " Exiting due to user request. Use -Resume to continue later." -ForegroundColor Yellow + return + } + + # CRITICAL: Get fresh token after interactive reauth + $accessToken = Get-GraphAccessToken + if (-not $accessToken) { + Write-LogHost " [AUTH] FATAL: Could not obtain access token after re-authentication" -ForegroundColor Red + if ($script:CheckpointEnabled) { + Save-Checkpoint -Force + } + Write-LogHost " Exiting due to token extraction failure. Use -Resume to continue later." 
-ForegroundColor Yellow + return + } + Write-LogHost " [AUTH] Fresh token obtained for retry phase" -ForegroundColor Green + # CRITICAL: Update shared auth state so thread jobs see the fresh token + $script:SharedAuthState.Token = $accessToken + $script:SharedAuthState.ExpiresOn = (Get-Date).ToUniversalTime().AddMinutes(50) + $script:SharedAuthState.LastRefresh = Get-Date + # CRITICAL: Reset auth failure flags after successful interactive re-auth + # Without this, old 401 errors in job buffers re-trigger the auth prompt + $script:AuthFailureDetected = $false + $script:Auth401MessageShown = $false + $script:AuthPromptInProgress = $false # Reset debounce flag + } + + # Re-authenticated successfully - token updated for retry phase + Write-LogHost " [AUTH] Resuming job monitoring with fresh token" -ForegroundColor Green + + # FIX D: Drain all job buffers to clear old 401 errors + # Without this, old errors in buffers re-trigger auth detection on next loop iteration + foreach ($drainJob in $jobs) { + if ($drainJob.State -ne 'Completed' -and $drainJob.State -ne 'Failed') { + try { + # Drain WITHOUT -Keep to clear the buffers + $null = Receive-Job -Job $drainJob -ErrorAction SilentlyContinue + } catch { } + } + } + Write-LogHost " [AUTH] Drained job buffers to clear old error messages" -ForegroundColor DarkGray + } + + Start-Sleep -Milliseconds 500 + } + + # Process completed jobs and collect results + $subdivisionOccurred = $false + # CRITICAL: Check error stream - jobs with State='Completed' but errors should be treated as failures + $completedNow = $jobs | Where-Object { $_.State -eq 'Completed' -and -not $script:processedJobIds.Contains($_.Id) -and $_.Error.Count -eq 0 } + + foreach ($job in $completedNow) { + $res = $null + try { + # Receive all output from job (includes Write-Output debug messages) + $allOutput = Receive-Job -Job $job -ErrorAction Stop + + # Filter output: PSCustomObject is the result, strings are debug messages + $debugMessages = $allOutput | 
Where-Object { $_ -is [string] } + $res = $allOutput | Where-Object { $_ -isnot [string] } | Select-Object -First 1 + + # Display [SENT] messages to terminal, log all debug messages to file + foreach ($debugMsg in $debugMessages) { + # Show [SENT] messages in terminal with color (with deduplication) + if ($debugMsg -match '^\[SENT\]') { + $msgKey = "$($job.Id):$debugMsg" + if (-not $script:shownJobMessages.ContainsKey($msgKey)) { + Write-LogHost $debugMsg -ForegroundColor DarkGray + $script:shownJobMessages[$msgKey] = $true + } + # CRITICAL: Extract and store QueryId from SENT message for retry reuse + if ($debugMsg -match 'QueryId:\s*([a-f0-9-]+)\)') { + $extractedQueryId = $matches[1] + $pt = $jobMeta[$job.Id] + if ($pt -and $script:partitionStatus.ContainsKey($pt.Index)) { + $script:partitionStatus[$pt.Index].QueryId = $extractedQueryId + } + } + } + + $debugTimestamp = Get-Date -Format 'yyyy-MM-dd HH:mm:ss' + + if ($debugMsg -match '^\[GRAPH-(WARN|ERROR)\]' -or $debugMsg -like 'Graph API Query Body*' -or $debugMsg -like 'API Stored Query Details*' -or $debugMsg -match '^\[NETWORK\]' -or $debugMsg -match '^\[ERROR\]') { + Write-LogHost $debugMsg -ForegroundColor DarkGray + } + } + } catch { + Write-LogHost " ✗ Error receiving job: $($_.Exception.Message)" -ForegroundColor Red + + # Mark as Failed for retry (job exceptions need explicit status update) + $pt = $jobMeta[$job.Id] + if ($pt -and $script:partitionStatus.ContainsKey($pt.Index)) { + $script:partitionStatus[$pt.Index].Status = 'Failed' + $script:partitionStatus[$pt.Index].LastError = $_.Exception.Message + Write-LogHost " [RETRY-QUEUE] Partition $($pt.Index)/$($pt.Total) marked as Failed for retry" -ForegroundColor Yellow + } + # Mark job as processed to avoid duplicate processing + [void]$script:processedJobIds.Add($job.Id) + continue # Skip to next job - this one failed + } + + $pt = $jobMeta[$job.Id] + + if ($null -ne $res) { + # CHECK FOR SUBDIVISION SIGNAL: Job returned needs_subdivision status + if 
($res.Status -eq 'needs_subdivision') { + Write-LogHost "[SUBDIVISION] Partition $($pt.Index)/$($pt.Total) requires subdivision - Preview count: $($res.PreviewCount)" -ForegroundColor Yellow + + # Calculate subdivision time windows + $partitionSpan = $res.PartitionEnd - $res.PartitionStart + $partitionHours = $partitionSpan.TotalHours + + # SMART SUBDIVISION: Analyze timestamp distribution in returned records + # to calculate optimal subdivision size instead of just dividing by 2 + $subdivisionFactor = 2 # Default to half + + if ($res.Logs -and $res.Logs.Count -eq 10000) { + try { + # Get timestamps from returned records + $timestamps = @() + foreach ($log in $res.Logs) { + if ($log.CreationTime) { + $ts = script:Parse-DateSafe $log.CreationTime + if ($ts) { $timestamps += $ts } + } + } + + if ($timestamps.Count -gt 100) { + # Sort timestamps and find the last one (most recent in the 10k batch) + $sortedTimestamps = $timestamps | Sort-Object + $lastTimestamp = $sortedTimestamps[-1] + $firstTimestamp = $sortedTimestamps[0] + + # Calculate how much of the partition timespan was covered by the 10k records + $coveredSpan = ($lastTimestamp - $firstTimestamp).TotalHours + $totalSpan = ($res.PartitionEnd - $res.PartitionStart).TotalHours + + if ($coveredSpan -gt 0 -and $coveredSpan -lt $totalSpan) { + # Calculate records per hour for the covered span + $recordsPerHour = 10000 / $coveredSpan + + # Calculate target hours to get ~8000 records (buffer below 10k) + $targetHours = 8000 / $recordsPerHour + + # Calculate subdivision factor (but cap it to avoid tiny partitions) + $minSubdivisionHours = 0.001389 * 24 # 2 minutes + if ($targetHours -ge $minSubdivisionHours) { + $subdivisionFactor = [Math]::Max(2, [Math]::Ceiling($totalSpan / $targetHours)) + Write-LogHost " [SMART SUBDIVISION] Analyzed 10k records: covered $([Math]::Round($coveredSpan,2))h of $([Math]::Round($totalSpan,2))h span" -ForegroundColor Cyan + Write-LogHost " [SMART SUBDIVISION] Estimated 
$([Math]::Round($recordsPerHour,0)) records/hour → dividing by $subdivisionFactor instead of 2" -ForegroundColor Cyan + } + } + } + } catch { + Write-LogHost " [SMART SUBDIVISION] Timestamp analysis failed, using default subdivision: $_" -ForegroundColor Yellow + } + } + + # Calculate subdivision size + $subPartitionHours = $partitionHours / $subdivisionFactor + $subPartitionSpan = [TimeSpan]::FromHours($subPartitionHours) + + # Create sub-partitions (divide evenly based on calculated factor) + $newSubPartitions = @() + for ($i = 0; $i -lt $subdivisionFactor; $i++) { + $subStart = $res.PartitionStart + ([TimeSpan]::FromHours($i * $subPartitionHours)) + $subEnd = if ($i -eq ($subdivisionFactor - 1)) { + $res.PartitionEnd # Last partition goes to the end + } else { + $res.PartitionStart + ([TimeSpan]::FromHours(($i + 1) * $subPartitionHours)) + } + + $newSubPartitions += [PSCustomObject]@{ + PStart = $subStart + PEnd = $subEnd + Index = $null + Total = $null + ParentIndex = $pt.Index + SubdivisionLevel = if ($pt.SubdivisionLevel) { $pt.SubdivisionLevel + 1 } else { 1 } + } + } + + Write-LogHost " Creating $($newSubPartitions.Count) sub-partitions:" -ForegroundColor DarkYellow + foreach ($subPt in $newSubPartitions) { + $subHours = ($subPt.PEnd - $subPt.PStart).TotalHours + Write-LogHost " $($subPt.PStart.ToString('yyyy-MM-dd HH:mm')) to $($subPt.PEnd.ToString('yyyy-MM-dd HH:mm')) ($([Math]::Round($subHours,2))h)" -ForegroundColor DarkYellow + } + + # Add sub-partitions to partitions array + $partitions += $newSubPartitions + + # Save old status data before re-indexing (indexed by old Index values) + $oldStatusData = @{} + foreach ($key in $script:partitionStatus.Keys) { + $oldStatusData[$key] = $script:partitionStatus[$key] + } + + # Re-index all partitions + $newTotal = $partitions.Count + for ($i = 0; $i -lt $partitions.Count; $i++) { + $partitions[$i].Index = $i + 1 + $partitions[$i].Total = $newTotal + } + + # Rebuild partition status dictionary with new indexes 
+ $script:partitionStatus = @{} + $script:partitionPageCounts = @{} + $script:jobOutputOffset = @{} + foreach ($partition in $partitions) { + # Find if this partition had existing status data (by object reference or parent index) + $existingStatus = $null + foreach ($oldStatus in $oldStatusData.Values) { + if ([object]::ReferenceEquals($oldStatus.Partition, $partition)) { + $existingStatus = $oldStatus + break + } + } + + if ($existingStatus) { + # Preserve existing status data but use new Index + $script:partitionStatus[$partition.Index] = @{ + Partition = $partition + AttemptNumber = $existingStatus.AttemptNumber + QueryId = $existingStatus.QueryId + QueryName = $existingStatus.QueryName + Status = $existingStatus.Status + LastError = $existingStatus.LastError + RecordCount = $existingStatus.RecordCount + ParentPartition = $existingStatus.ParentPartition + SubdivisionReason = $existingStatus.SubdivisionReason + } + } else { + # New sub-partition - initialize fresh + $script:partitionStatus[$partition.Index] = @{ + Partition = $partition + AttemptNumber = 0 + QueryId = $null + QueryName = $null + Status = 'NotStarted' + LastError = $null + RecordCount = 0 + ParentPartition = $pt.Index + } + } + } + + # Mark parent partition as Subdivided (now using new Index from re-indexed partition) + $parentPartition = $partitions | Where-Object { [object]::ReferenceEquals($_, $pt) } | Select-Object -First 1 + if ($parentPartition -and $script:partitionStatus.ContainsKey($parentPartition.Index)) { + $script:partitionStatus[$parentPartition.Index].Status = 'Subdivided' + $script:partitionStatus[$parentPartition.Index].RecordCount = 0 + $script:partitionStatus[$parentPartition.Index].SubdivisionReason = "preview_count_$($res.PreviewCount)" + } + + # Mark as processed to avoid duplicate handling + [void]$script:processedJobIds.Add($job.Id) + + # Set flag to break out of inner loop and re-queue new sub-partitions + $subdivisionOccurred = $true + Write-LogHost " [SUBDIV-DEBUG] After 
creating sub-partitions: Total partitions=$($partitions.Count)" -ForegroundColor Magenta
+                            break # Break from the job processing loop to re-start with new sub-partitions
+                        }
+                        elseif ($res.TokenExpired -eq $true) {
+                            # FIX C: Token expired in job - mark for retry with fresh token
+                            Write-LogHost "[TOKEN-RETRY] Partition $($pt.Index)/$($pt.Total) returned due to expired token - will retry with fresh token" -ForegroundColor Yellow
+
+                            # Reset partition status to allow retry
+                            if ($script:partitionStatus.ContainsKey($pt.Index)) {
+                                $script:partitionStatus[$pt.Index].Status = 'NotStarted'
+                                $script:partitionStatus[$pt.Index].LastError = 'Token expired - retry pending'
+                                # Preserve QueryId if present for retry reuse
+                                if ($res.QueryId) {
+                                    $script:partitionStatus[$pt.Index].QueryId = $res.QueryId
+                                }
+                            }
+
+                            # Mark job as processed
+                            [void]$script:processedJobIds.Add($job.Id)
+
+                            # Set auth failure flag to trigger token refresh before retry
+                            $script:AuthFailureDetected = $true
+                        }
+                        else {
+                            # NORMAL COMPLETION: Not a subdivision - process as completed query
+                            # Track whether data was actually added to collection
+                            $dataAddedToCollection = $false
+                            $recordsBeforeAdd = $allLogs.Count
+
+                            # Add logs to collection (skip when memory flush enabled - data goes to JSONL only)
+                            if ($res.Logs -and $res.Logs.Count -gt 0 -and -not $script:memoryFlushEnabled) {
+                                foreach ($log in $res.Logs) {
+                                    [void]$allLogs.Add($log)
+                                }
+                                $dataAddedToCollection = ($allLogs.Count -gt $recordsBeforeAdd) # -gt compares; '>' is output redirection and would write the count to a file named after $recordsBeforeAdd
+                            }
+
+                            # Update partition status tracking - mark Complete if query succeeded (even with 0 records)
+                            if ($script:partitionStatus.ContainsKey($pt.Index)) {
+                                $queryName = "PAX_Query_$($pt.PStart.ToString('yyyyMMdd_HHmm'))-$($pt.PEnd.ToString('yyyyMMdd_HHmm'))_Part$($pt.Index)/$($pt.Total)"
+                                $script:partitionStatus[$pt.Index].QueryName = $queryName
+                                $script:partitionStatus[$pt.Index].Status = 'Complete'
+                                $script:partitionStatus[$pt.Index].QueryId = $res.QueryId
+                                
$script:partitionStatus[$pt.Index].RecordCount = $res.RetrievedCount + + # CHECKPOINT: Save Completed state - this partition's data is now fully fetched + if ($script:CheckpointEnabled) { + Save-Checkpoint -PartitionIndex $pt.Index -QueryId $res.QueryId -State 'Completed' + } + + # INCREMENTAL SAVE: Write partition records to disk immediately (prevents data loss on auth failure) + if ($res.Logs -and $res.Logs.Count -gt 0) { + try { + $incrementalDir = Join-Path (Split-Path $script:PartialOutputPath -Parent) ".pax_incremental" + if (-not (Test-Path $incrementalDir)) { + New-Item -ItemType Directory -Path $incrementalDir -Force | Out-Null + } + $incrementalFile = Join-Path $incrementalDir "Part$($pt.Index)_${global:ScriptRunTimestamp}_qid-$($res.QueryId)_$($res.RetrievedCount)records.jsonl" + # Write as JSON Lines (NDJSON) - one record per line for recoverability + $res.Logs | ForEach-Object { $_ | ConvertTo-Json -Depth 10 -Compress } | Out-File -FilePath $incrementalFile -Encoding utf8 -Force + Write-LogHost " [SAVE] Partition $($pt.Index): $($res.RetrievedCount) records saved to disk" -ForegroundColor DarkGreen + } catch { + Write-LogHost " [WARN] Failed to save incremental data for Partition $($pt.Index): $($_.Exception.Message)" -ForegroundColor Yellow + } + } + } + + # Collect telemetry if present + if ($res.Telemetry) { + $script:telemetryData += $res.Telemetry + } + + # Show throttle retry summary if any retries occurred + if ($res.Telemetry -and $res.Telemetry.ThrottleRetriesDuringCreation -and $res.Telemetry.ThrottleRetriesDuringCreation -gt 0) { + Write-LogHost " [!] 
Throttled during query creation: $($res.Telemetry.ThrottleRetriesDuringCreation) retry(s)" -ForegroundColor Yellow
+                            }
+
+                            try {
+                                $script:metrics.QueryMs += [int]$res.ElapsedMs
+                                $script:metrics.TotalRecordsFetched += [int]$res.RetrievedCount
+                                # Count records by their actual Operation value, not by query group name
+                                if ($res.Logs -and $res.Logs.Count -gt 0) {
+                                    foreach ($log in $res.Logs) {
+                                        # Handle both Operation (EOM format) and Operations (Graph API normalized format)
+                                        $actualOperation = if ($log.Operation) { $log.Operation } elseif ($log.Operations) { $log.Operations } else { $null }
+                                        if (-not [string]::IsNullOrWhiteSpace($actualOperation)) {
+                                            if (-not $script:metrics.Activities.ContainsKey($actualOperation)) {
+                                                $script:metrics.Activities[$actualOperation] = @{ Retrieved = 0; Structured = 0 }
+                                            }
+                                            $script:metrics.Activities[$actualOperation].Retrieved += 1
+                                        }
+                                    }
+                                }
+                            } catch {}
+                        } # End of else block for normal completion
+
+
+                        # Capture diagnostic output for any failed jobs so payload logging is persisted
+                        $failedNow = $jobs | Where-Object { $_.State -eq 'Failed' -and (-not $script:processedJobIds.Contains($_.Id)) }
+                        foreach ($failedJob in $failedNow) { # own loop variable: reusing $job/$pt here clobbered the enclosing foreach's current job and partition
+                            $failedPt = $jobMeta[$failedJob.Id]
+                            try {
+                                $failOutput = Receive-Job -Job $failedJob -ErrorAction SilentlyContinue
+                            }
+                            catch {
+                                $failOutput = $null
+                            }
+
+                            if ($failOutput) {
+                                foreach ($msg in $failOutput) {
+                                    if ($msg -is [string]) {
+                                        $msgKey = "$($failedJob.Id):$msg"
+                                        if (-not $script:shownJobMessages.ContainsKey($msgKey)) {
+                                            $script:shownJobMessages[$msgKey] = $true
+                                        }
+
+                                        $debugTimestamp = Get-Date -Format 'yyyy-MM-dd HH:mm:ss'
+
+                                        if ($msg -match '^[\[]?(GRAPH-(WARN|ERROR)|ERROR|NETWORK|ATTEMPT|SENT)') {
+                                            Write-LogHost $msg -ForegroundColor DarkGray
+                                        }
+                                        elseif ($msg -like 'Graph API Query Body*' -or $msg -like 'API Stored Query Details*') {
+                                            Write-LogHost $msg -ForegroundColor DarkGray
+                                        }
+                                        else {
+                                            Write-LogHost $msg -ForegroundColor DarkGray
+                                        }
+                                    }
+                                }
+                            }
+
+                            if ($failedPt -and $script:partitionStatus.ContainsKey($failedPt.Index)) {
+                                $script:partitionStatus[$failedPt.Index].Status = 'Failed'
+                                $script:partitionStatus[$failedPt.Index].LastError = 'ThreadJob failed before completion'
+                            }
+
+                            [void]$script:processedJobIds.Add($failedJob.Id)
+                        }
+
+                        # Handle jobs that completed with errors in error stream (State='Completed' but had Write-Error calls)
+                        $completedWithErrors = $jobs | Where-Object { $_.State -eq 'Completed' -and (-not $script:processedJobIds.Contains($_.Id)) -and $_.Error.Count -gt 0 }
+                        foreach ($erroredJob in $completedWithErrors) { # own loop variable so the enclosing loop's $job/$pt stay intact for the progress update below
+                            $erroredPt = $jobMeta[$erroredJob.Id]
+                            $errorMsg = ($erroredJob.Error | Select-Object -First 1).ToString()
+                            Write-LogHost "[ERROR-STREAM] Partition $($erroredPt.Index)/$($erroredPt.Total) - Job completed but had error: $errorMsg" -ForegroundColor Red
+
+                            # Capture diagnostic output from the job
+                            try {
+                                $errorOutput = Receive-Job -Job $erroredJob -ErrorAction SilentlyContinue
+                                if ($errorOutput) {
+                                    foreach ($msg in $errorOutput) {
+                                        if ($msg -is [string]) {
+                                            $msgKey = "$($erroredJob.Id):$msg"
+                                            if (-not $script:shownJobMessages.ContainsKey($msgKey)) {
+                                                $script:shownJobMessages[$msgKey] = $true
+                                                if ($msg -match '^[\[]?(GRAPH-(WARN|ERROR)|ERROR|NETWORK|ATTEMPT|SENT)') {
+                                                    Write-LogHost $msg -ForegroundColor DarkGray
+                                                }
+                                            }
+                                        }
+                                    }
+                                }
+                            } catch {}
+
+                            # Mark partition as Failed for retry
+                            if ($erroredPt -and $script:partitionStatus.ContainsKey($erroredPt.Index)) {
+                                $script:partitionStatus[$erroredPt.Index].Status = 'Failed'
+                                $script:partitionStatus[$erroredPt.Index].LastError = $errorMsg
+                                Write-LogHost " [RETRY-QUEUE] Partition $($erroredPt.Index)/$($erroredPt.Total) marked as Failed for retry due to error stream" -ForegroundColor Yellow
+                            }
+
+                            [void]$script:processedJobIds.Add($erroredJob.Id)
+                        }
+                        # Note: Logs already added to $allLogs earlier (moved above to validate data retrieval)
+
+                        # Update progress - in parallel mode, each partition = 1 unit of progress
+                        $script:progressState.Query.Current = [Math]::Min($script:progressState.Query.Current + 1, $script:progressState.Query.Total)
+                        Update-Progress 
-Status ("Partition {0}/{1} complete" -f $pt.Index, $pt.Total) + $qc = $script:progressState.Query.Current + $qt = $script:progressState.Query.Total + Write-ProgressTick + } + + # Mark job as processed to avoid duplicate processing + [void]$script:processedJobIds.Add($job.Id) + } + + # If subdivision occurred, break to queue new sub-partitions + if ($subdivisionOccurred) { + Write-LogHost " Subdivision detected - queuing new sub-partitions..." -ForegroundColor Yellow + # Don't wait for all jobs to complete, loop back now to queue subdivided partitions + Write-LogHost " [SUBDIV-DEBUG] After break, outer loop should check for pending NotStarted partitions" -ForegroundColor Magenta + break + } + + # All jobs completed - begin retry and reconciliation phase + $finalStates = $jobs | Group-Object -Property State | ForEach-Object { "$($_.Name):$($_.Count)" } + Write-LogHost " Initial job execution complete. States: $($finalStates -join ', ')" -ForegroundColor DarkCyan + + # Mark any completed jobs as Complete if status wasn't already updated + # (This handles jobs where result object wasn't captured but job completed successfully) + # CRITICAL: Check for job errors first - ThreadJobs that throw exceptions still end up with State='Completed' + $completedJobs = $jobs | Where-Object { $_.State -eq 'Completed' } + foreach ($completedJob in $completedJobs) { + $pt = $jobMeta[$completedJob.Id] + if ($script:partitionStatus.ContainsKey($pt.Index)) { + # Check if job produced errors (401, network failures, thrown exceptions) + $jobErrors = @() + try { + $jobErrors = @($completedJob.ChildJobs | ForEach-Object { $_.Error } | Where-Object { $_ }) + # ThreadJobs store errors directly in .Error, not in ChildJobs + if ($jobErrors.Count -eq 0 -and $completedJob.Error.Count -gt 0) { + $jobErrors = @($completedJob.Error) + } + } catch {} + + if ($jobErrors.Count -gt 0 -and $script:partitionStatus[$pt.Index].Status -notin @('Complete', 'Subdivided')) { + # Job completed WITH errors - mark 
as Failed for retry + $script:partitionStatus[$pt.Index].Status = 'Failed' + $script:partitionStatus[$pt.Index].LastError = $jobErrors[0].ToString() + # QueryId preserved — query may still be valid on Purview for retry resume + Write-LogHost " [ERROR-CHECK] Partition $($pt.Index) job completed with errors - marked Failed for retry" -ForegroundColor Yellow + } + elseif ($script:partitionStatus[$pt.Index].Status -in @('NotStarted','JobCreated')) { + # Validate that partition actually saved data before marking Complete + # ThreadJobs that hit 504/network errors internally may complete with State='Completed' + # but never save any data to disk. Check for JSONL file existence before trusting the job state. + $hasDataOnDisk = $false + try { + $validationIncrDir = Join-Path (Split-Path $script:PartialOutputPath -Parent) ".pax_incremental" + if (Test-Path $validationIncrDir) { + $partJsonl = Get-ChildItem -Path $validationIncrDir -Filter "Part$($pt.Index)_${global:ScriptRunTimestamp}_*.jsonl" -ErrorAction SilentlyContinue + $hasDataOnDisk = ($partJsonl -and @($partJsonl).Count -gt 0) + } + } catch {} + + if ($hasDataOnDisk) { + # JSONL file exists — partition genuinely completed + $script:partitionStatus[$pt.Index].Status = 'Complete' + # Job completed before monitoring loop polled it; result object was never + # received, so ThreadSavedToDisk was never processed. Activate streaming export here. 
+ if ($script:memoryFlushEnabled -and -not $script:memoryFlushed) { + $script:memoryFlushed = $true + Write-LogHost " [MEMORY] Thread-side JSONL confirmed in reconciliation - streaming export activated" -ForegroundColor Yellow + } + } else { + # No JSONL file — job finished but never saved data (likely 504/auth failure swallowed internally) + $script:partitionStatus[$pt.Index].Status = 'Failed' + $script:partitionStatus[$pt.Index].LastError = 'Job completed without saving data (no JSONL file found) - likely network/auth failure swallowed by retry logic' + Write-LogHost " [DATA-CHECK] Partition $($pt.Index) job completed but NO data saved to disk - marked Failed for retry" -ForegroundColor Yellow + } + } + elseif ($script:partitionStatus[$pt.Index].Status -eq 'Complete') { + $queryIdMissing = [string]::IsNullOrWhiteSpace([string]$script:partitionStatus[$pt.Index].QueryId) + $recordCountZero = ([int]($script:partitionStatus[$pt.Index].RecordCount ?? 0) -le 0) + if ($queryIdMissing -and $recordCountZero) { + $script:partitionStatus[$pt.Index].Status = 'Failed' + $script:partitionStatus[$pt.Index].LastError = 'Partition marked Complete but has empty QueryId and zero records' + Write-LogHost " [DATA-CHECK] Partition $($pt.Index) had invalid completion state (empty QueryId + 0 records) - marked Failed for retry" -ForegroundColor Yellow + } + } + } + } + + # Track failed jobs and update status + $failedJobs = $jobs | Where-Object { $_.State -eq 'Failed' } + foreach ($failedJob in $failedJobs) { + $pt = $jobMeta[$failedJob.Id] + if ($script:partitionStatus.ContainsKey($pt.Index)) { + $script:partitionStatus[$pt.Index].Status = 'Failed' + try { + $errorOutput = Receive-Job -Job $failedJob -ErrorAction SilentlyContinue 2>&1 + if ($errorOutput) { + $lastErrorText = ($errorOutput | Out-String).Trim() + $script:partitionStatus[$pt.Index].LastError = $lastErrorText + + # Log the actual error for diagnostics (useful level) + Write-LogHost " Partition $($pt.Index) error: 
$lastErrorText" -ForegroundColor DarkYellow + + # 401/Unauthorized failures don't count against retry limit (auth issue, not partition issue) + # This ensures partitions can get full retry attempts after re-authentication + if ($lastErrorText -match '401|Unauthorized') { + Write-LogHost " [AUTH] 401 error for Partition $($pt.Index) - will not count against retry limit" -ForegroundColor Yellow + } + } + } catch { + $script:partitionStatus[$pt.Index].LastError = $_.Exception.Message + } + } + } # RETRY LOGIC: Up to 4 additional attempts (5 total) + $maxAttempts = 5 + $retryPass = 1 + + while ($retryPass -lt $maxAttempts) { + # Find partitions that need retry (Status = Failed or NotStarted only) + $partitionsToRetry = @() + foreach ($idx in $script:partitionStatus.Keys) { + $status = $script:partitionStatus[$idx] + # Only retry actual failures, not queries that completed with 0 records + if ($status.Status -in @('Failed', 'NotStarted')) { + if ($status.AttemptNumber -lt $maxAttempts) { + $partitionsToRetry += $status.Partition + } + } + } + + if ($partitionsToRetry.Count -eq 0) { + Write-LogHost " No partitions require retry" -ForegroundColor Green + break + } + + $retryPass++ + Write-LogHost " [RETRY] Pass $retryPass/$maxAttempts - $($partitionsToRetry.Count) partition(s) need retry" -ForegroundColor Yellow + + # Cooldown before retry + $cooldownSeconds = Get-Random -Minimum 30 -Maximum 60 + Write-LogHost " Waiting $cooldownSeconds seconds before retry..." 
-ForegroundColor Gray + Start-Sleep -Seconds $cooldownSeconds + + # CRITICAL: Refresh access token before partition retry + # For AppRegistration mode, force re-authentication to get fresh token + # For other modes, use Get-GraphAccessToken helper (HTTP primary) + try { + $tokenObtained = $false + + # Check if we can use forced re-authentication (AppRegistration only) + if ($script:AuthConfig.CanReauthenticate -and $script:AuthConfig.Method -eq 'AppRegistration') { + $refreshResult = Invoke-TokenRefresh -Force + if ($refreshResult.Success -and $refreshResult.NewToken) { + $accessToken = $refreshResult.NewToken + $tokenObtained = $true + Write-LogHost " [TOKEN] Access token refreshed via re-authentication" -ForegroundColor Cyan + } else { + Write-LogHost " [TOKEN] Re-authentication failed: $($refreshResult.Message)" -ForegroundColor Yellow + } + } + + # If AppRegistration refresh failed or using interactive auth, use helper + if (-not $tokenObtained) { + $newToken = Get-GraphAccessToken + if ($newToken) { + if ($newToken -ne $accessToken) { + $accessToken = $newToken + $tokenObtained = $true + Write-LogHost " [TOKEN] Fresh token obtained for retry phase" -ForegroundColor Cyan + } else { + # Token is the same - that's OK if we just re-authenticated + $tokenObtained = $true + Write-LogHost " [TOKEN] Token validated (unchanged but valid)" -ForegroundColor Gray + } + } + } + + # CRITICAL: Block retry if we can't get a valid token + if (-not $tokenObtained) { + Write-LogHost " [TOKEN] FATAL: Cannot obtain valid access token for retry" -ForegroundColor Red + Write-LogHost " [TOKEN] This typically means the session has expired and requires re-authentication" -ForegroundColor Yellow + + # For interactive modes, prompt user to re-authenticate + if ($script:AuthConfig.Method -in @('WebLogin', 'ExistingSession')) { + Write-LogHost " [TOKEN] Prompting for re-authentication..." 
-ForegroundColor Yellow + $reauthResult = Invoke-TokenRefreshPrompt + + if ($reauthResult -eq 'Quit') { + if ($script:CheckpointEnabled) { Save-Checkpoint -Force } + Write-LogHost " Exiting due to user request. Use -Resume to continue later." -ForegroundColor Yellow + return + } + + # Try getting token again after reauth + $accessToken = Get-GraphAccessToken + if (-not $accessToken) { + Write-LogHost " [TOKEN] FATAL: Still cannot obtain token after re-authentication" -ForegroundColor Red + if ($script:CheckpointEnabled) { Save-Checkpoint -Force } + Write-LogHost " Exiting due to token extraction failure. Use -Resume to continue later." -ForegroundColor Yellow + return + } + Write-LogHost " [TOKEN] Fresh token obtained after re-authentication" -ForegroundColor Green + } else { + # AppRegistration mode with no token - fatal error + if ($script:CheckpointEnabled) { Save-Checkpoint -Force } + Write-LogHost " Exiting due to authentication failure. Use -Resume to continue later." -ForegroundColor Yellow + return + } + } + } catch { + Write-LogHost " [TOKEN] Error during token refresh: $($_.Exception.Message)" -ForegroundColor Red + # Don't proceed with potentially expired token + if ($script:CheckpointEnabled) { Save-Checkpoint -Force } + Write-LogHost " Exiting due to token error. Use -Resume to continue later." 
-ForegroundColor Yellow
+                    return
+                }
+
+                # Create retry jobs
+                # One ThreadJob is started per partition selected for this retry pass; $retryJobMeta maps
+                # job Id -> partition object so results can be attributed after the jobs finish.
+                $retryJobs = @()
+                $retryJobMeta = @{}
+                # Non-string output drained by the polling loop below, keyed by job Id.
+                # Receive-Job (without -Keep) removes output as it is read, so a result object that is
+                # polled while sibling jobs are still running must be buffered here or it is lost before
+                # the result-processing pass runs.
+                $retryJobResults = @{}
+                $retryMaxConcurrency = [Math]::Min($maxConcurrentPartitions, 3)
+                Write-LogHost " [RETRY] Using reduced retry concurrency: $retryMaxConcurrency (initial phase: $maxConcurrentPartitions)" -ForegroundColor DarkCyan
+
+                foreach ($pt in $partitionsToRetry) {
+                    $script:partitionStatus[$pt.Index].AttemptNumber++
+                    $script:partitionStatus[$pt.Index].Status = 'NotStarted' # Reset for retry
+
+                    Write-LogHost " [RETRY] Attempt $($script:partitionStatus[$pt.Index].AttemptNumber)/$maxAttempts for Partition $($pt.Index)/$($pt.Total)" -ForegroundColor Yellow
+
+                    # Re-create the job using the same scriptblock
+                    # CRITICAL: Use $graphResultSize (0 for Graph API = unlimited) instead of $ResultSize (10000)
+                    # Pass existing QueryId if available (for retry after 403 fetch failure)
+                    $existingQueryId = $script:partitionStatus[$pt.Index].QueryId
+                    if ($existingQueryId) {
+                        Write-LogHost " [REUSE] Reusing existing QueryId: $existingQueryId" -ForegroundColor Cyan
+                    }
+                    $job = Start-ThreadJob -ThrottleLimit $retryMaxConcurrency -ScriptBlock $queryJobScriptBlock -ArgumentList $pt.PStart, $pt.PEnd, $activities, $graphResultSize, $UserIds, $pt.Index, $pt.Total, $script:SharedAuthState, $pt, $MaxNetworkOutageMinutes, $script:GraphAuditApiVersion, $script:LogFile, $existingQueryId, $threadIncrementalDir, $threadRunTimestamp, $threadMemoryFlushEnabled
+                    $retryJobs += $job
+                    $retryJobMeta[$job.Id] = $pt
+                }
+
+                # Poll until no retry job is Running/NotStarted, surfacing [SENT] messages as they arrive
+                while (($retryJobs | Where-Object { $_.State -in 'Running','NotStarted' }).Count -gt 0) {
+                    foreach ($job in $retryJobs) {
+                        try {
+                            $jobOutput = Receive-Job -Job $job -ErrorAction SilentlyContinue
+                            if ($jobOutput) {
+                                foreach ($output in $jobOutput) {
+                                    if ($output -is [string] -and $output -match '^\[SENT\]') {
+                                        $msgKey = "$($job.Id):$output"
+                                        if (-not $script:shownJobMessages.ContainsKey($msgKey)) {
+                                            Write-LogHost $output -ForegroundColor DarkGray
+                                            $script:shownJobMessages[$msgKey] = $true
+                                        }
+                                        # CRITICAL: Extract and store QueryId from SENT message for retry reuse
+                                        if ($output -match 'QueryId:\s*([a-f0-9-]+)\)') {
+                                            $extractedQueryId = $matches[1]
+                                            $pt = $retryJobMeta[$job.Id]
+                                            if ($pt -and $script:partitionStatus.ContainsKey($pt.Index)) {
+                                                $script:partitionStatus[$pt.Index].QueryId = $extractedQueryId
+                                            }
+                                        }
+                                    }
+                                    elseif ($null -ne $output -and $output -isnot [string]) {
+                                        # Result object arrived while other jobs are still running - keep it
+                                        # for the result-processing pass instead of dropping it.
+                                        if (-not $retryJobResults.ContainsKey($job.Id)) {
+                                            $retryJobResults[$job.Id] = [System.Collections.Generic.List[object]]::new()
+                                        }
+                                        $retryJobResults[$job.Id].Add($output)
+                                    }
+                                }
+                            }
+                        } catch {}
+                    }
+                    Start-Sleep -Milliseconds 500
+                }
+
+                # Check for retry jobs that completed but have errors in error stream
+                # These need to be marked Failed so they get re-queued (mirrors initial pass logic)
+                foreach ($job in $retryJobs) {
+                    if ($job.State -eq 'Completed' -and $job.Error.Count -gt 0) {
+                        $pt = $retryJobMeta[$job.Id]
+                        $errorText = ($job.Error | Out-String).Trim()
+                        Write-LogHost "[ERROR-STREAM] Retry Partition $($pt.Index)/$($pt.Total) - Job completed but had error: $errorText" -ForegroundColor Yellow
+                        if ($script:partitionStatus.ContainsKey($pt.Index)) {
+                            $script:partitionStatus[$pt.Index].Status = 'Failed'
+                            $script:partitionStatus[$pt.Index].LastError = $errorText
+                            Write-LogHost " [RETRY-QUEUE] Partition $($pt.Index)/$($pt.Total) marked as Failed for next retry pass due to error stream" -ForegroundColor Yellow
+                        }
+                    }
+                }
+
+                # Process retry job results
+                foreach ($job in $retryJobs) {
+                    $pt = $retryJobMeta[$job.Id]
+
+                    # Skip jobs already marked Failed by error-stream check above
+                    if ($script:partitionStatus.ContainsKey($pt.Index) -and $script:partitionStatus[$pt.Index].Status -eq 'Failed') {
+                        continue
+                    }
+
+                    if ($job.State -eq 'Completed') {
+                        try {
+                            # Combine whatever is still queued on the job with output already drained by
+                            # the polling loop; the result object may be in either place.
+                            $allOutput = @(Receive-Job -Job $job -ErrorAction Stop)
+                            if ($retryJobResults.ContainsKey($job.Id)) {
+                                $allOutput = @($retryJobResults[$job.Id]) + $allOutput
+                            }
+                            $res = $allOutput | Where-Object { $_ -isnot [string] } | Select-Object -First 1
+
+                            if ($null -ne $res -and $script:partitionStatus.ContainsKey($pt.Index)) {
+                                if ([string]::IsNullOrWhiteSpace([string]$res.QueryId) -and [int]($res.RetrievedCount ?? 0) -le 0) {
+                                    $script:partitionStatus[$pt.Index].Status = 'Failed'
+                                    $script:partitionStatus[$pt.Index].LastError = 'Retry ThreadJob returned empty QueryId with zero records (query was not created/sent)'
+                                    Write-LogHost " [DATA-CHECK] Retry Partition $($pt.Index) returned empty QueryId with 0 records - marked Failed for next retry" -ForegroundColor Yellow
+                                    continue
+                                }
+
+                                $script:partitionStatus[$pt.Index].QueryId = $res.QueryId
+                                $script:partitionStatus[$pt.Index].QueryName = "PAX_Query_$($pt.PStart.ToString('yyyyMMdd_HHmm'))-$($pt.PEnd.ToString('yyyyMMdd_HHmm'))_Part$($pt.Index)/$($pt.Total)"
+                                $script:partitionStatus[$pt.Index].RecordCount = $res.RetrievedCount
+                                $script:partitionStatus[$pt.Index].Status = 'Complete'
+
+                                Write-LogHost " Retry successful for Partition $($pt.Index)/$($pt.Total): $($res.RetrievedCount) records" -ForegroundColor Green
+                                if ($res.ThreadSavedToDisk -and $script:memoryFlushEnabled -and -not $script:memoryFlushed) {
+                                    $script:memoryFlushed = $true
+                                    Write-LogHost " [MEMORY] Thread-side persistence active - streaming export will use JSONL files" -ForegroundColor Yellow
+                                }
+
+                                # Add to allLogs
+                                if ($res.Logs -and $res.Logs.Count -gt 0) {
+                                    foreach ($log in $res.Logs) {
+                                        [void]$allLogs.Add($log)
+                                    }
+
+                                    # INCREMENTAL SAVE: Write retry partition records to disk immediately (prevents data loss on auth failure)
+                                    # BUG FIX: This was missing from retry path, causing data loss on subsequent failures
+                                    try {
+                                        $incrementalDir = Join-Path (Split-Path $script:PartialOutputPath -Parent) ".pax_incremental"
+                                        if (-not (Test-Path $incrementalDir)) {
+                                            New-Item -ItemType Directory -Path $incrementalDir -Force | Out-Null
+                                        }
+                                        $incrementalFile = Join-Path $incrementalDir "Part$($pt.Index)_${global:ScriptRunTimestamp}_qid-$($res.QueryId)_$($res.RetrievedCount)records.jsonl"
+                                        # Write as JSON Lines (NDJSON) - one record per line for recoverability
+                                        $res.Logs | ForEach-Object { $_ | ConvertTo-Json -Depth 10 -Compress } | Out-File -FilePath $incrementalFile -Encoding utf8 -Force
+                                        Write-LogHost " [SAVE] Retry Partition $($pt.Index): $($res.RetrievedCount) records saved to disk" -ForegroundColor DarkGreen
+                                    } catch {
+                                        Write-LogHost " [WARN] Failed to save incremental data for Retry Partition $($pt.Index): $($_.Exception.Message)" -ForegroundColor Yellow
+                                    }
+                                }
+
+                                # Update checkpoint for retry success
+                                if ($script:CheckpointEnabled) {
+                                    Save-Checkpoint -PartitionIndex $pt.Index -QueryId $res.QueryId -State 'Completed' -RecordCount $res.RetrievedCount
+                                }
+                            }
+                            elseif ($null -eq $res -and $script:partitionStatus.ContainsKey($pt.Index)) {
+                                # Job finished without emitting a result object: record why the partition is
+                                # being re-queued instead of leaving it silently in 'NotStarted'.
+                                $script:partitionStatus[$pt.Index].Status = 'Failed'
+                                $script:partitionStatus[$pt.Index].LastError = 'Retry ThreadJob completed without returning a result object'
+                                Write-LogHost " [DATA-CHECK] Retry Partition $($pt.Index) completed without a result object - marked Failed for next retry" -ForegroundColor Yellow
+                            }
+                        } catch {
+                            Write-LogHost " ✗ Retry failed for Partition $($pt.Index)/$($pt.Total): $($_.Exception.Message)" -ForegroundColor Red
+                            if ($script:partitionStatus.ContainsKey($pt.Index)) {
+                                $script:partitionStatus[$pt.Index].Status = 'Failed'
+                                $script:partitionStatus[$pt.Index].LastError = $_.Exception.Message
+
+                                # 401/Unauthorized failures don't count against retry limit (auth issue, not partition issue)
+                                if ($_.Exception.Message -match '401|Unauthorized') {
+                                    $script:partitionStatus[$pt.Index].AttemptNumber--
+                                    Write-LogHost " [AUTH] 401 error detected - retry attempt not counted (auth issue, not partition issue)" -ForegroundColor Yellow
+                                }
+                            }
+                        }
+                    } elseif ($job.State -eq 'Failed') {
+                        Write-LogHost " ✗ Retry job failed for Partition $($pt.Index)/$($pt.Total)" -ForegroundColor Red
+                        if ($script:partitionStatus.ContainsKey($pt.Index)) {
+                            $script:partitionStatus[$pt.Index].Status = 'Failed'
+                            try {
+                                $errorOutput = Receive-Job -Job $job -ErrorAction SilentlyContinue 2>&1
+                                if ($errorOutput) {
+                                    $lastErrorText = ($errorOutput | Out-String).Trim()
+                                    $script:partitionStatus[$pt.Index].LastError = $lastErrorText
+
+                                    # Log the actual error for diagnostics (useful level)
+                                    Write-LogHost " Error details: $lastErrorText" -ForegroundColor DarkYellow
+
+                                    # 401/Unauthorized failures don't count against retry limit (auth issue, not partition issue)
+                                    if ($lastErrorText -match '401|Unauthorized') {
+                                        $script:partitionStatus[$pt.Index].AttemptNumber--
+                                        Write-LogHost " [AUTH] 401 error detected - retry attempt not counted (auth issue, not partition issue)" -ForegroundColor Yellow
+                                    }
+                                }
+                            } catch {}
+                        }
+                    }
+                }
+
+                # Cleanup retry jobs
+                Remove-Job -Job $retryJobs -Force -ErrorAction SilentlyContinue | Out-Null
+            }
+
+            # FINAL SUMMARY AND RECONCILIATION
+            Write-LogHost "" -ForegroundColor White
+            Write-LogHost "═══════════════════════════════════════════════════════════════" -ForegroundColor Cyan
+            Write-LogHost " QUERY SUBMISSION SUMMARY" -ForegroundColor Cyan
+            Write-LogHost "═══════════════════════════════════════════════════════════════" -ForegroundColor Cyan
+
+            $totalPartitions = $script:partitionStatus.Count
+            $completedPartitions = @($script:partitionStatus.Values | Where-Object { $_.Status -eq 'Complete' })
+            $sentButIncomplete = @($script:partitionStatus.Values | Where-Object { $_.Status -eq 'Sent' -and $_.QueryId })
+            $neverSent = @($script:partitionStatus.Values | Where-Object { $_.Status -in @('Failed', 'NotStarted') -and -not $_.QueryId })
+
+            Write-LogHost " Total Partitions: $totalPartitions" -ForegroundColor White
+            Write-LogHost " Sent and Complete: $($completedPartitions.Count)" -ForegroundColor Green
+
+            if ($sentButIncomplete.Count -gt 0) {
+                Write-LogHost " [!] 
Sent but Incomplete: $($sentButIncomplete.Count)" -ForegroundColor Yellow + foreach ($status in $sentButIncomplete) { + $pt = $status.Partition + Write-LogHost " - Partition $($pt.Index)/$($pt.Total): QueryName=$($status.QueryName), QueryId=$($status.QueryId)" -ForegroundColor Yellow + } + } + + if ($neverSent.Count -gt 0) { + Write-LogHost " ✗ Never Sent: $($neverSent.Count)" -ForegroundColor Red + foreach ($status in $neverSent) { + $pt = $status.Partition + $errorMsg = if ($status.LastError) { " - Error: $($status.LastError)" } else { "" } + Write-LogHost " - Partition $($pt.Index)/$($pt.Total): QueryName=$($status.QueryName)$errorMsg" -ForegroundColor Red + } + } + + # Check for any missing/skipped partitions (exclude intentionally skipped partitions from resume mode) + $expectedPartitions = 1..$totalPartitions + $attemptedPartitions = @($script:partitionStatus.Keys) + + # In resume mode, use the originally-skipped partition indices (captured at start of run) + # This avoids confusion where partitions completed during THIS run appear as "previously completed" + $intentionallySkipped = @() + if ($script:IsResumeMode -and $script:OriginallySkippedPartitionIndices) { + $intentionallySkipped = $script:OriginallySkippedPartitionIndices + } + + $missingPartitions = $expectedPartitions | Where-Object { $_ -notin $attemptedPartitions -and $_ -notin $intentionallySkipped } + + if ($missingPartitions.Count -gt 0) { + Write-LogHost " [!] 
MISSING/SKIPPED PARTITIONS: $($missingPartitions.Count)" -ForegroundColor Red + Write-LogHost " Partitions: $($missingPartitions -join ', ')" -ForegroundColor Red + } + + # Show info about intentionally skipped partitions (resume mode) + if ($intentionallySkipped.Count -gt 0) { + Write-LogHost " Previously completed partitions (from checkpoint): $($intentionallySkipped -join ', ')" -ForegroundColor Green + } + + Write-LogHost "═══════════════════════════════════════════════════════════════" -ForegroundColor Cyan + Write-LogHost "" -ForegroundColor White + + # Continue with remaining processing if we have any completed partitions + if ($completedPartitions.Count -eq 0) { + Write-LogHost " ✗ ERROR: No partitions completed successfully. Cannot continue." -ForegroundColor Red + throw "All partitions failed - no data retrieved" + } + + Write-LogHost " Continuing with $($completedPartitions.Count) successful partition(s)..." -ForegroundColor Green + + # Process any remaining completed jobs that weren't caught in the loop + # CRITICAL: Skip jobs with errors - they should have been handled by retry logic + $remainingCompleted = $jobs | Where-Object { $_.State -eq 'Completed' -and -not $script:processedJobIds.Contains($_.Id) -and $_.Error.Count -eq 0 } + foreach ($job in $remainingCompleted) { + $res = $null + try { + $res = Receive-Job -Job $job -ErrorAction Stop + } catch { + Write-LogHost " ERROR receiving job: $($_.Exception.Message)" -ForegroundColor Red + } + + $pt = $jobMeta[$job.Id] # Skip if job metadata not found (shouldn't happen but defensive) + if ($null -eq $pt) { + Write-LogHost " WARNING: Job metadata not found for job ID $($job.Id) - skipping" -ForegroundColor Yellow + [void]$script:processedJobIds.Add($job.Id) + continue + } + + if ($null -ne $res) { + Write-LogHost " Partition $($pt.Index)/$($pt.Total) complete: Retrieved $($res.RetrievedCount) records in $($res.ElapsedMs)ms" -ForegroundColor Cyan + Write-LogHost " Query: $($pt.PStart.ToString('yyyy-MM-dd 
HH:mm')) to $($pt.PEnd.ToString('yyyy-MM-dd HH:mm')) UTC" -ForegroundColor DarkGray + if ($res.ThreadSavedToDisk -and $script:memoryFlushEnabled -and -not $script:memoryFlushed) { + $script:memoryFlushed = $true + Write-LogHost " [MEMORY] Thread-side persistence active - streaming export will use JSONL files" -ForegroundColor Yellow + } + try { + $script:metrics.QueryMs += [int]$res.ElapsedMs + $script:metrics.TotalRecordsFetched += [int]$res.RetrievedCount + # Count records by their actual Operation value, not by query group name + if ($res.Logs -and $res.Logs.Count -gt 0) { + foreach ($log in $res.Logs) { + # Handle both Operation (EOM format) and Operations (Graph API normalized format) + $actualOperation = if ($log.Operation) { $log.Operation } elseif ($log.Operations) { $log.Operations } else { $null } + if (-not [string]::IsNullOrWhiteSpace($actualOperation)) { + if (-not $script:metrics.Activities.ContainsKey($actualOperation)) { + $script:metrics.Activities[$actualOperation] = @{ Retrieved = 0; Structured = 0 } + } + $script:metrics.Activities[$actualOperation].Retrieved += 1 + } + } + } + } catch {} + + if ($res.Logs -and $res.Logs.Count -gt 0) { + foreach ($log in $res.Logs) { + [void]$allLogs.Add($log) + } + } + + # Update progress - in parallel mode, each partition = 1 unit of progress + $script:progressState.Query.Current = [Math]::Min($script:progressState.Query.Current + 1, $script:progressState.Query.Total) + Update-Progress -Status ("Partition {0}/{1} complete (retry)" -f $pt.Index, $pt.Total) + $qc = $script:progressState.Query.Current + $qt = $script:progressState.Query.Total + Write-ProgressTick + } + + [void]$script:processedJobIds.Add($job.Id) + } + + # Cleanup + Remove-Job -Job $jobs -Force -ErrorAction SilentlyContinue | Out-Null + + # Normalize progress if needed + if ($script:progressState.Query.Current -lt $script:progressState.Query.Total -and $script:progressState.Query.Total -le 200) { + $script:progressState.Query.Current = 
$script:progressState.Query.Total + Update-Progress -Status 'Parallel partitions complete (normalized)' + } + } + } + catch { + Write-LogHost " Graph API parallel execution error: $($_.Exception.Message)" -ForegroundColor Red + Write-LogHost " Error details: $($_.ScriptStackTrace)" -ForegroundColor Red + throw + } + } # End of while (-not $allPartitionsProcessed) loop - subdivision pass complete + + # Show accurate completion status (only for parallel mode - sequential has its own summary) + if ($canParallel -and $script:partitionStatus) { + $successCount = @($script:partitionStatus.Values | Where-Object { $_.Status -eq 'Complete' }).Count + $failedCount = $partitions.Count - $successCount + if ($failedCount -eq 0) { + Write-LogHost " All $successCount partitions completed" -ForegroundColor Green + } else { + Write-LogHost " [!] $successCount/$($partitions.Count) partitions completed ($failedCount failed)" -ForegroundColor Yellow + } + } + } + # Sequential fallback: Process partitions one-by-one when parallel execution is not available + # This handles: EOM mode (-UseEOM), PS 5.1, or when ThreadJob module is unavailable + if (-not $canParallel) { + $sequentialGroups++ + Write-LogHost " Processing $($partitions.Count) partitions sequentially..." 
-ForegroundColor DarkCyan + foreach ($pt in $partitions) { + $tq0 = Get-Date + if (-not $UseEOM) { Write-LogHost " Querying partition $($pt.Index)/$($pt.Total) sequentially" -ForegroundColor DarkCyan } + $logs = Invoke-ActivityTimeWindowProcessing -ActivityType $pt.Activity -StartDate $pt.PStart -EndDate $pt.PEnd -PartitionIndex $pt.Index -TotalPartitions $pt.Total -UseEOMMode $UseEOM + $tq1 = Get-Date + try { + $ms = [int]($tq1 - $tq0).TotalMilliseconds + $script:metrics.QueryMs += $ms + if ($logs) { + # Count records by their actual Operation value, not by query group name + $logArray = if ($logs -is [Array]) { $logs } else { @($logs) } + foreach ($log in $logArray) { + # Handle both Operation (EOM format) and Operations (Graph API normalized format) + $actualOperation = if ($log.Operation) { $log.Operation } elseif ($log.Operations) { $log.Operations } else { $null } + if (-not [string]::IsNullOrWhiteSpace($actualOperation)) { + if (-not $script:metrics.Activities.ContainsKey($actualOperation)) { + $script:metrics.Activities[$actualOperation] = @{ Retrieved = 0; Structured = 0 } + } + $script:metrics.Activities[$actualOperation].Retrieved += 1 + } + } + $script:metrics.TotalRecordsFetched += $logArray.Count + # Add to collection + foreach ($item in $logArray) { [void]$allLogs.Add($item) } + } + # Explicit progress tick per sequential partition + $script:progressState.Query.Current = [Math]::Min($script:progressState.Query.Current + 1, $script:progressState.Query.Total) + Write-ProgressTick + } catch { + Write-LogHost " Warning: Error processing partition $($pt.Index): $($_.Exception.Message)" -ForegroundColor Yellow + } + } + Write-LogHost " Sequential processing complete: $($allLogs.Count) records retrieved" -ForegroundColor Green + } + } + $script:CurrentServiceFilter = $null + + # ============================================================================ + # FINAL SAFETY NET: Ensure ALL partitions were completed before export phase + # This catches any 
partitions that slipped through all retry mechanisms
+    # ============================================================================
+    # Flow: collect every partition whose status is not Complete/Subdivided, then make up to
+    # $maxFinalAttempts sequential recovery passes through Invoke-ActivityTimeWindowProcessing.
+    # Recovered records are appended to $allLogs and written as JSONL under .pax_incremental.
+    # Skipped entirely in EOM mode (-not $UseEOM) or when no partition status was tracked.
+    if ($script:partitionStatus -and $script:partitionStatus.Count -gt 0 -and -not $UseEOM) {
+        # Find partitions not in terminal success states
+        $incompletePartitions = @($script:partitionStatus.Values | Where-Object {
+            $_.Status -notin @('Complete', 'Subdivided')
+        })
+
+        if ($incompletePartitions.Count -gt 0) {
+            Write-LogHost "" -ForegroundColor Yellow
+            Write-LogHost "============================================================" -ForegroundColor Yellow
+            Write-LogHost "[FINAL-RECONCILE] $($incompletePartitions.Count) partition(s) incomplete - initiating final recovery" -ForegroundColor Yellow
+            Write-LogHost "============================================================" -ForegroundColor Yellow
+
+            # List each incomplete partition
+            foreach ($incomplete in $incompletePartitions) {
+                # The partition index lives on the nested partition object, not on the status entry
+                # itself; fall back to the entry only when no partition object is attached.
+                $incompleteIndex = if ($incomplete.Partition) { $incomplete.Partition.Index } else { $incomplete.Index }
+                $lastErr = if ($incomplete.LastError) { " (Last error: $($incomplete.LastError.Substring(0, [Math]::Min(80, $incomplete.LastError.Length)))...)" } else { "" }
+                Write-LogHost " Partition $($incompleteIndex): Status=$($incomplete.Status)$lastErr" -ForegroundColor DarkYellow
+            }
+
+            $maxFinalAttempts = 5
+            $finalAttempt = 0
+            $recoveredCount = 0
+
+            while ($finalAttempt -lt $maxFinalAttempts) {
+                # Re-check which partitions still need recovery
+                $stillIncomplete = @($script:partitionStatus.Values | Where-Object {
+                    $_.Status -notin @('Complete', 'Subdivided')
+                })
+
+                if ($stillIncomplete.Count -eq 0) {
+                    Write-LogHost "[FINAL-RECONCILE] All partitions recovered successfully!" -ForegroundColor Green
+                    break
+                }
+
+                $finalAttempt++
+                Write-LogHost "[FINAL-RECONCILE] Attempt $finalAttempt/$maxFinalAttempts - $($stillIncomplete.Count) partition(s) remaining" -ForegroundColor Yellow
+
+                # Refresh token before retry (critical for long-running exports)
+                $refreshResult = Refresh-GraphTokenIfNeeded -BufferMinutes 5
+                if ($refreshResult -is [string] -and $refreshResult -eq 'Quit') {
+                    if ($script:CheckpointEnabled) { Save-Checkpoint -Force }
+                    Show-CheckpointExitMessage
+                    exit 0
+                }
+
+                # Get fresh token for recovery jobs
+                # The token value is not passed on from here; the call acts as a gate so recovery
+                # is not attempted when no token can be obtained.
+                $recoveryToken = Get-GraphAccessToken
+                if (-not $recoveryToken) {
+                    Write-LogHost "[FINAL-RECONCILE] Cannot obtain access token - saving checkpoint and exiting" -ForegroundColor Red
+                    if ($script:CheckpointEnabled) { Save-Checkpoint -Force }
+                    Show-CheckpointExitMessage
+                    exit 0
+                }
+
+                # Brief cooldown between attempts
+                if ($finalAttempt -gt 1) {
+                    $cooldown = Get-Random -Minimum 15 -Maximum 30
+                    Write-LogHost " Waiting $cooldown seconds before retry..." -ForegroundColor Gray
+                    Start-Sleep -Seconds $cooldown
+                }
+
+                # Process each incomplete partition sequentially (safer for final recovery)
+                foreach ($incomplete in $stillIncomplete) {
+                    $pt = $incomplete.Partition
+                    if (-not $pt) {
+                        Write-LogHost " [WARN] Partition $($incomplete.Index) has no partition object - cannot recover" -ForegroundColor Red
+                        continue
+                    }
+
+                    Write-LogHost " [RECOVER] Retrying Partition $($pt.Index)/$($pt.Total)..." -ForegroundColor Cyan
+
+                    try {
+                        # Reset status for retry
+                        $script:partitionStatus[$pt.Index].Status = 'NotStarted'
+                        $script:partitionStatus[$pt.Index].AttemptNumber++
+
+                        # Record count for THIS partition only. Kept separate from $logArray so the
+                        # checkpoint below never reuses a count left over from a previous partition.
+                        $recoveredRecordCount = 0
+
+                        # Use sequential processing for recovery (Invoke-ActivityTimeWindowProcessing)
+                        $logs = Invoke-ActivityTimeWindowProcessing -ActivityType $pt.Activity -StartDate $pt.PStart -EndDate $pt.PEnd -PartitionIndex $pt.Index -TotalPartitions $pt.Total -UseEOMMode $false
+
+                        if ($logs) {
+                            $logArray = if ($logs -is [Array]) { $logs } else { @($logs) }
+                            $recoveredRecordCount = $logArray.Count
+
+                            # Add to $allLogs
+                            foreach ($item in $logArray) { [void]$allLogs.Add($item) }
+
+                            # Save to JSONL for streaming export
+                            $incrementalDir = Join-Path (Split-Path $script:PartialOutputPath -Parent) ".pax_incremental"
+                            if (-not (Test-Path $incrementalDir)) { New-Item -ItemType Directory -Path $incrementalDir -Force | Out-Null }
+                            $incrementalFile = Join-Path $incrementalDir "Part$($pt.Index)_${global:ScriptRunTimestamp}_qid-recovery_$($logArray.Count)records.jsonl"
+                            $logArray | ForEach-Object { $_ | ConvertTo-Json -Depth 10 -Compress } | Out-File -FilePath $incrementalFile -Encoding utf8 -Force
+
+                            # Mark complete
+                            $script:partitionStatus[$pt.Index].Status = 'Complete'
+                            $script:partitionStatus[$pt.Index].RecordCount = $recoveredRecordCount
+                            $recoveredCount++
+
+                            Write-LogHost " [RECOVERED] Partition $($pt.Index): $($logArray.Count) records" -ForegroundColor Green
+
+                            # Update metrics
+                            $script:metrics.TotalRecordsFetched += $recoveredRecordCount
+                        } else {
+                            # Zero records is valid - mark complete
+                            $script:partitionStatus[$pt.Index].Status = 'Complete'
+                            $script:partitionStatus[$pt.Index].RecordCount = 0
+                            $recoveredCount++
+                            Write-LogHost " [RECOVERED] Partition $($pt.Index): 0 records (empty time window)" -ForegroundColor Green
+                        }
+
+                        # Save checkpoint after each successful recovery
+                        if ($script:CheckpointEnabled) {
+                            Save-Checkpoint -PartitionIndex $pt.Index -State 'Completed' -RecordCount $recoveredRecordCount
+                        }
+                    } catch {
+                        $script:partitionStatus[$pt.Index].Status = 'Failed'
+                        $script:partitionStatus[$pt.Index].LastError = $_.Exception.Message
+                        Write-LogHost " [FAILED] Partition $($pt.Index): $($_.Exception.Message)" -ForegroundColor Red
+                    }
+                }
+            }
+
+            # Final status report
+            $finalIncomplete = @($script:partitionStatus.Values | Where-Object {
+                $_.Status -notin @('Complete', 'Subdivided')
+            })
+
+            if ($finalIncomplete.Count -gt 0) {
+                Write-LogHost "" -ForegroundColor Red
+                Write-LogHost "============================================================" -ForegroundColor Red
+                Write-LogHost "[FINAL-RECONCILE] WARNING: $($finalIncomplete.Count) partition(s) could not be recovered after $maxFinalAttempts attempts" -ForegroundColor Red
+                Write-LogHost "============================================================" -ForegroundColor Red
+                foreach ($failed in $finalIncomplete) {
+                    # Same index resolution as the listing above: prefer the nested partition object
+                    $failedIndex = if ($failed.Partition) { $failed.Partition.Index } else { $failed.Index }
+                    Write-LogHost " Partition $($failedIndex): $($failed.LastError)" -ForegroundColor Red
+                }
+                Write-LogHost " These partitions will be missing from the export. Use -Resume to retry later." -ForegroundColor Yellow
+            } else {
+                Write-LogHost "[FINAL-RECONCILE] Complete: Recovered $recoveredCount partition(s) via final safety net" -ForegroundColor Green
+            }
+            Write-LogHost "" -ForegroundColor White
+        }
+    }
+
+    # MEMORY FLUSH MODE: If we flushed $allLogs during fetch, enable streaming merge from JSONL files
+    # This path is only for non-explosion mode (explosion is excluded from memory flush feature)
+    if ($script:memoryFlushed) {
+        Write-LogHost " [MEMORY] Memory flush occurred during fetch - enabling streaming export from JSONL files" -ForegroundColor Yellow
+        $script:UseStreamingMergeForExport = $true
+        $script:StreamingMergeDirectory = Split-Path $script:PartialOutputPath -Parent
+
+        # Get all completed partition indices from this run for streaming merge
+        # Note: partitionStatus values are hashtables with a 'Partition' key containing the partition object;
+        # the Index property lives on the partition object, not on the status hashtable itself
+        $completedPartitions = @($script:partitionStatus.Values | Where-Object { $_.Status -eq 'Complete' } | ForEach-Object { $_.Partition.Index })
+        $script:StreamingMergePartitions = $completedPartitions
+
+        # Count records from partition status for accurate totals
+        $estimatedFromJSONL = ($completedPartitions | ForEach-Object { [int]($script:partitionStatus[$_].RecordCount ?? 
0) } | Measure-Object -Sum).Sum + $script:StreamingMergeRecordCount = $estimatedFromJSONL + Write-LogHost " [MEMORY] Found $($estimatedFromJSONL.ToString('N0')) records across $($completedPartitions.Count) partitions for streaming export" -ForegroundColor DarkCyan + } + + # MERGE INCREMENTAL SAVES: Only needed for Resume mode to recover data from skipped partitions + # Fresh runs already have all records in $allLogs (added when each partition completed) + # In resume mode, only merge data for partitions that were SKIPPED (already completed before this run) + # Partitions completed during THIS run already have their data in $allLogs via AddRange + if ($script:IsResumeMode) { + $partitionsToMerge = $script:OriginallySkippedPartitionIndices + if ($partitionsToMerge -and $partitionsToMerge.Count -gt 0) { + Write-LogHost " [MERGE] Merging incremental data for previously-completed partitions: $($partitionsToMerge -join ', ')" -ForegroundColor Cyan + + # Use streaming merge when we have many partitions OR large record counts to avoid memory exhaustion + # Threshold: If more than 20 partitions to merge, or estimated >500K records, use streaming + $estimatedRecords = 0 + $incrementalDir = Join-Path (Split-Path $script:PartialOutputPath -Parent) ".pax_incremental" + # Primary: derive record count from checkpoint data — checkpoint stores exact per-partition recordCount + # set at completion time, and is not affected by multi-file (snapshot/append) inflation or stale + # files from prior runs that may still exist in the .pax_incremental folder. + if ($script:CheckpointData -and $script:CheckpointData.partitions.completed) { + $estimatedRecords = [int](($script:CheckpointData.partitions.completed | + Where-Object { [int]$_.index -in $partitionsToMerge } | + ForEach-Object { [int]$_.records } | + Measure-Object -Sum).Sum) + } + # Fallback: estimate from JSONL filenames if checkpoint data is unavailable or zero. 
+ # Two filters applied to avoid inflation: + # (1) Current run's timestamp only — excludes stale files from prior runs in the same folder. + # (2) No snapshot files — snapshots are cumulative in-progress saves (subset of final file); + # counting them alongside the final file would double-count those records. + if ($estimatedRecords -eq 0 -and (Test-Path $incrementalDir)) { + $filesToMerge = Get-ChildItem -Path $incrementalDir -Filter "*.jsonl" -ErrorAction SilentlyContinue | Where-Object { + $partMatch = [regex]::Match($_.Name, '^Part(\d+)_') + $partMatch.Success -and ([int]$partMatch.Groups[1].Value -in $partitionsToMerge) -and ($_.Name -like "*${global:ScriptRunTimestamp}*") + } + foreach ($f in $filesToMerge) { + if ($f.Name -notmatch '_snapshot_page\d+_' -and $f.Name -match '_(\d+)records\.jsonl$') { + $estimatedRecords += [int]$Matches[1] + } + } + } + + $useStreamingMerge = ($partitionsToMerge.Count -gt 20) -or ($estimatedRecords -gt 500000) -or ($allLogs.Count -eq 0) + + # Disable streaming merge for explosion modes - explosion requires in-memory processing + # Streaming merge only works with the non-explosion fast path (direct 1:1 CSV export) + $isExplosionMode = ($ExplodeDeep -or $ExplodeArrays -or $ForcedRawInputCsvExplosion) + if ($useStreamingMerge -and $isExplosionMode) { + Write-LogHost " [MERGE] Explosion mode active - loading records into memory (streaming merge not supported with explosion)" -ForegroundColor DarkYellow + $useStreamingMerge = $false + } + + if ($useStreamingMerge) { + $streamingReason = if ($allLogs.Count -eq 0) { "all partitions from prior run" } elseif ($partitionsToMerge.Count -gt 20) { "$($partitionsToMerge.Count) partitions" } else { "~$($estimatedRecords.ToString('N0')) records" } + Write-LogHost " [MERGE] Large merge detected ($streamingReason) - using streaming mode" -ForegroundColor Yellow + Write-LogHost " [MERGE] Streaming merge avoids memory exhaustion for large datasets" -ForegroundColor DarkGray + + # Flag that we're 
using streaming - will need special handling for CSV export + $script:UseStreamingMergeForExport = $true + $script:StreamingMergePartitions = $partitionsToMerge + $script:StreamingMergeDirectory = Split-Path $script:PartialOutputPath -Parent + + # Don't merge into $allLogs - we'll stream directly to CSV later + # Just count the records for metrics + $mergedFromIncremental = $estimatedRecords + Write-LogHost " [MERGE] Deferred streaming merge: $($mergedFromIncremental.ToString('N0')) records will be streamed during export" -ForegroundColor DarkGray + } elseif ($isExplosionMode -and $allLogs.Count -eq 0) { + # Special case: Explosion mode with all partitions from prior run + # Need to load JSONL directly into $allLogs (Merge-IncrementalSaves requires non-empty collection) + $script:UseStreamingMergeForExport = $false + Write-LogHost " [MERGE] Loading $($estimatedRecords.ToString('N0')) records from prior run into memory for explosion..." -ForegroundColor Cyan + $incrementalDir = Join-Path (Split-Path $script:PartialOutputPath -Parent) ".pax_incremental" + $mergedFromIncremental = 0 + if (Test-Path $incrementalDir) { + $filesToLoad = Get-ChildItem -Path $incrementalDir -Filter "*.jsonl" -ErrorAction SilentlyContinue | Where-Object { + $partMatch = [regex]::Match($_.Name, '^Part(\d+)_') + $partMatch.Success -and ([int]$partMatch.Groups[1].Value -in $partitionsToMerge) + } + foreach ($file in $filesToLoad) { + try { + $lines = Get-Content -Path $file.FullName -Encoding utf8 + foreach ($line in $lines) { + if (-not [string]::IsNullOrWhiteSpace($line)) { + try { + $record = $line | ConvertFrom-Json + [void]$allLogs.Add($record) + $mergedFromIncremental++ + } catch {} + } + } + } catch { + Write-LogHost " [WARN] Failed to load $($file.Name): $($_.Exception.Message)" -ForegroundColor Yellow + } + } + } + Write-LogHost " [MERGE] Loaded $($mergedFromIncremental.ToString('N0')) records into memory" -ForegroundColor Green + } else { + # Small merge - use original in-memory 
approach (faster for small datasets) + $script:UseStreamingMergeForExport = $false + $mergedFromIncremental = Merge-IncrementalSaves -AllLogs $allLogs -OutputDirectory (Split-Path $script:PartialOutputPath -Parent) -CleanupAfterMerge $false -OnlyPartitionIndices $partitionsToMerge + } + + # Update TotalRecordsFetched to include merged records (these were "fetched" in a previous run) + if ($mergedFromIncremental -gt 0) { + $script:metrics.TotalRecordsFetched += $mergedFromIncremental + Write-LogHost " [MERGE] Updated record count: +$mergedFromIncremental from previous run" -ForegroundColor DarkGray + } + } else { + $mergedFromIncremental = 0 + $script:UseStreamingMergeForExport = $false + } + # Note: Incremental files are retained until successful script completion for data safety + # Cleanup happens at end of script (before exit) to allow recovery if explosion/export fails + } else { + $mergedFromIncremental = 0 + # Preserve UseStreamingMergeForExport if memory flush already set it (fresh run with JSONL-only data) + if (-not $script:memoryFlushed) { + $script:UseStreamingMergeForExport = $false + } + # Note: Incremental files are retained until successful script completion for data safety + } + + # ============================================================================ + # ZERO-RECORD RECOVERY: If all partitions "completed" but returned 0 records, + # re-check QueryIds directly against Purview - the queries may still have data + # This handles machine sleep/suspension where thread jobs died but queries succeeded + # ============================================================================ + $effectiveRecordCount = $allLogs.Count + if ($script:UseStreamingMergeForExport) { + $effectiveRecordCount += $(if ($script:StreamingMergeRecordCount -gt 0) { $script:StreamingMergeRecordCount } else { $mergedFromIncremental }) + } + + if ($effectiveRecordCount -eq 0 -and $script:partitionStatus -and $script:partitionStatus.Count -gt 0 -and -not $UseEOM) { + # Find 
partitions with valid QueryIds that reported 0 records + # Include both 'Complete' and 'Failed' partitions - queries persist 30 days on Purview + # regardless of thread job outcome, so data may still be recoverable + $partitionsWithQueryIds = @($script:partitionStatus.Values | Where-Object { + $_.QueryId -and $_.Status -in @('Complete','Failed') -and ($_.RecordCount -eq 0 -or $null -eq $_.RecordCount) + }) + + if ($partitionsWithQueryIds.Count -gt 0) { + Write-LogHost "" -ForegroundColor Yellow + Write-LogHost "============================================================" -ForegroundColor Yellow + Write-LogHost "[ZERO-RECORD-RECOVERY] All partitions returned 0 records but have valid QueryIds" -ForegroundColor Yellow + Write-LogHost "[ZERO-RECORD-RECOVERY] Re-checking queries directly against Purview..." -ForegroundColor Yellow + Write-LogHost "============================================================" -ForegroundColor Yellow + + # Refresh token first (critical after potential machine sleep) + $refreshResult = Refresh-GraphTokenIfNeeded -BufferMinutes 5 + if ($refreshResult -is [string] -and $refreshResult -eq 'Quit') { + if ($script:CheckpointEnabled) { Save-Checkpoint -Force } + Show-CheckpointExitMessage + exit 0 + } + + $recoveredRecords = 0 + $queriesChecked = 0 + $queriesWithData = 0 + + foreach ($partitionStatus in $partitionsWithQueryIds) { + $queryId = $partitionStatus.QueryId + $partitionIndex = $partitionStatus.Partition.Index + $partitionTotal = $partitionStatus.Partition.Total + $queriesChecked++ + + Write-LogHost " [CHECK] Partition $partitionIndex/$partitionTotal - QueryId: $queryId" -ForegroundColor Cyan + + try { + # Check query status directly + $status = Get-GraphAuditQueryStatus -QueryId $queryId + + if ($status -and $status.Status -eq 'succeeded' -and $status.RecordCount -gt 0) { + $queriesWithData++ + Write-LogHost " [FOUND] Query has $($status.RecordCount) records - fetching..." 
-ForegroundColor Green + + # Fetch the records + $rawRecords = Get-GraphAuditRecords -QueryId $queryId + + if ($rawRecords -and $rawRecords.Count -gt 0) { + # Normalize to EOM-compatible format + $normalizedRecords = ConvertFrom-GraphAuditRecord -GraphRecords $rawRecords + + # Add to allLogs + foreach ($rec in $normalizedRecords) { + [void]$allLogs.Add($rec) + } + + # Update partition status + $script:partitionStatus[$partitionIndex].RecordCount = $normalizedRecords.Count + $recoveredRecords += $normalizedRecords.Count + + # Save to incremental for checkpoint safety + $incrementalDir = Join-Path (Split-Path $script:PartialOutputPath -Parent) ".pax_incremental" + if (-not (Test-Path $incrementalDir)) { New-Item -ItemType Directory -Path $incrementalDir -Force | Out-Null } + $incrementalFile = Join-Path $incrementalDir "Part${partitionIndex}_${global:ScriptRunTimestamp}_qid-recovery_$($normalizedRecords.Count)records.jsonl" + $normalizedRecords | ForEach-Object { $_ | ConvertTo-Json -Depth 10 -Compress } | Out-File -FilePath $incrementalFile -Encoding utf8 -Force + + Write-LogHost " [RECOVERED] $($normalizedRecords.Count) records from partition $partitionIndex" -ForegroundColor Green + + # Update metrics + $script:metrics.TotalRecordsFetched += $normalizedRecords.Count + } + } elseif ($status) { + Write-LogHost " [EMPTY] Query status: $($status.Status), RecordCount: $($status.RecordCount)" -ForegroundColor DarkGray + } else { + Write-LogHost " [WARN] Could not retrieve query status" -ForegroundColor Yellow + } + } catch { + Write-LogHost " [ERROR] Failed to check/fetch QueryId $queryId : $($_.Exception.Message)" -ForegroundColor Red + } + } + + # Summary + Write-LogHost "" -ForegroundColor White + if ($recoveredRecords -gt 0) { + Write-LogHost "[ZERO-RECORD-RECOVERY] SUCCESS: Recovered $($recoveredRecords.ToString('N0')) records from $queriesWithData query(ies)" -ForegroundColor Green + Write-LogHost "[ZERO-RECORD-RECOVERY] This typically indicates machine 
sleep/suspension caused thread job failures" -ForegroundColor Yellow + Write-LogHost "[ZERO-RECORD-RECOVERY] while the Purview queries completed successfully server-side." -ForegroundColor Yellow + } else { + Write-LogHost "[ZERO-RECORD-RECOVERY] No additional records found - date range may genuinely be empty" -ForegroundColor DarkGray + } + Write-LogHost "" -ForegroundColor White + } + } + + Set-ProgressPhase -Phase 'Explosion' -Status 'Analyzing and exploding records' + Write-LogHost ""; Write-LogHost "=== Enterprise Processing Summary ===" -ForegroundColor Green + if ($script:UseStreamingMergeForExport -and $mergedFromIncremental -gt 0) { + Write-LogHost "Records retrieved this run: $($allLogs.Count)" -ForegroundColor Cyan + Write-LogHost "Records from prior run (streaming): $($mergedFromIncremental.ToString('N0'))" -ForegroundColor Cyan + Write-LogHost "Total records for export: $(($allLogs.Count + $mergedFromIncremental).ToString('N0'))" -ForegroundColor Green + } elseif ($script:UseStreamingMergeForExport -and $script:StreamingMergeRecordCount -gt 0) { + # Fresh run with memory flush - records are in JSONL, not $allLogs + Write-LogHost "Total audit records retrieved: $($script:StreamingMergeRecordCount.ToString('N0')) (streaming from JSONL)" -ForegroundColor Cyan + } else { + Write-LogHost "Total audit records retrieved: $($allLogs.Count)" -ForegroundColor Cyan + } + + # Deduplicate by RecordId to handle session pagination retry scenarios + # Bug context: When session pagination retries with a new SessionId after transient failures, + # Search-UnifiedAuditLog can return the same records again, causing duplicates in $allLogs. + # This deduplication ensures each unique audit record appears only once in the final output. + $preDedupeCount = $allLogs.Count + if ($preDedupeCount -gt 0) { + try { + Write-LogHost "Running deduplication check on $preDedupeCount records..." 
-ForegroundColor DarkGray + + # Deduplicate records by unique ID + # Live queries use 'Identity' property, CSV exports use 'Id' property + $uniqueLogs = New-Object System.Collections.ArrayList + $seenIds = New-Object System.Collections.Generic.HashSet[string] + $duplicateSkipped = 0 + + foreach ($log in $allLogs) { + $recordId = $null + try { + # Check properties in order of likelihood (Identity for live queries, Id for CSV) + if ($log.Identity) { $recordId = $log.Identity } + elseif ($log.Id) { $recordId = $log.Id } + elseif ($log.RecordId) { $recordId = $log.RecordId } + } catch {} + + if ($recordId -and -not $seenIds.Contains($recordId)) { + [void]$seenIds.Add($recordId) + [void]$uniqueLogs.Add($log) + } + elseif (-not $recordId) { + # Preserve records without an Id (shouldn't happen, but be safe) + [void]$uniqueLogs.Add($log) + } + else { + # Duplicate ID detected - skip this record + $duplicateSkipped++ + } + } + + # Report deduplication results + $duplicatesRemoved = $preDedupeCount - $uniqueLogs.Count + if ($duplicatesRemoved -gt 0) { + Write-LogHost "Deduplication: Removed $duplicatesRemoved duplicate record(s) (pagination retry artifacts)" -ForegroundColor Yellow + $allLogs = $uniqueLogs + } + else { + Write-LogHost "Deduplication: No duplicates found" -ForegroundColor DarkGray + } + } + catch { + Write-LogHost "Warning: Deduplication failed: $($_.Exception.Message) - proceeding with original records" -ForegroundColor DarkYellow + } + } + + # Client-side date-range trimming — remove records returned outside the requested date boundaries. + # Purview's partition-based indexing can bleed records up to ~10 hours past EndDate on large tenants. 
+ if (($script:TrimStartDateUTC -or $script:TrimEndDateUTC) -and $allLogs.Count -gt 0) { + $preTrimCount = $allLogs.Count + $trimmedLogs = New-Object System.Collections.ArrayList($preTrimCount) + foreach ($log in $allLogs) { + $cd = script:Parse-DateSafe $log.CreationDate + if (-not $cd) { [void]$trimmedLogs.Add($log); continue } # Keep records with unparseable dates + $cdUtc = $cd.ToUniversalTime() + if ($script:TrimStartDateUTC -and $cdUtc -lt $script:TrimStartDateUTC) { continue } + if ($script:TrimEndDateUTC -and $cdUtc -ge $script:TrimEndDateUTC) { continue } + [void]$trimmedLogs.Add($log) + } + $trimCount = $preTrimCount - $trimmedLogs.Count + if ($trimCount -gt 0) { + $script:DateTrimCount += $trimCount + $allLogs = $trimmedLogs + Write-LogHost "Date-range trim: Removed $trimCount record(s) outside requested date boundaries" -ForegroundColor Yellow + } + } + + # Show accurate record count — in streaming mode allLogs may be empty because records went JSONL→CSV directly + if ($script:UseStreamingMergeForExport) { + # StreamingMergeRecordCount = memory flush fresh run; mergedFromIncremental = deferred resume merge + $streamCount = if ($script:StreamingMergeRecordCount -gt 0) { $script:StreamingMergeRecordCount } else { $mergedFromIncremental } + $effectiveTotal = $allLogs.Count + $streamCount + Write-LogHost "Records for export: $($effectiveTotal.ToString('N0')) (cross-partition deduplication occurs during streaming merge)" -ForegroundColor Cyan + } else { + Write-LogHost "Unique audit records: $($allLogs.Count)" -ForegroundColor Cyan + } + if ($script:Hit10KLimit -or $script:Hit1MLimit) { + Write-LogHost ""; + $limitType = if ($script:Hit1MLimit) { "1M (Graph API)" } else { "10K (EOM)" } + Write-LogHost " CRITICAL NOTICE: $limitType record limit was reached during processing!" 
-ForegroundColor Red + + try { Write-LogHost ("Structured rows produced: {0}" -f $structuredDataCount) -ForegroundColor DarkGray } catch {} + try { Write-LogHost ("Metrics.TotalStructuredRows: {0}" -f $script:metrics.TotalStructuredRows) -ForegroundColor DarkGray } catch {} + # Show subdivision summary if adaptive subdivision occurred + if ($script:SubdividedPartitions.Count -gt 0) { + Write-LogHost ""; + Write-LogHost " ADAPTIVE SUBDIVISION SUMMARY:" -ForegroundColor Yellow + Write-LogHost " Total partitions subdivided: $($script:SubdividedPartitions.Count)" -ForegroundColor Yellow + foreach ($key in $script:SubdividedPartitions.Keys) { + $count = $script:SubdividedPartitions[$key] + Write-LogHost " - Time range: $key (preview count: $count)" -ForegroundColor DarkYellow + } + Write-LogHost " Subdivision successfully prevented data loss by splitting high-volume partitions" -ForegroundColor Green + } + } + if ($allLogs.Count -eq 0 -and $RAWInputCSV) { + try { + $rehydrated = Import-Csv -Path $RAWInputCSV + $allLogs = New-Object System.Collections.ArrayList + foreach ($row in $rehydrated) { + try { + $creation = if ($row.CreationDate) { script:Parse-DateSafe $row.CreationDate } else { $null } + $identity = if ($row.Id) { $row.Id } elseif ($row.RecordId) { $row.RecordId } else { [guid]::NewGuid().ToString() } + $rec = [pscustomobject]@{ + RecordType = $(try { [int]$row.RecordType } catch { 0 }) + CreationDate = $(if ($creation) { $creation } else { Get-Date }) + UserIds = @(if ($row.UserIds) { $row.UserIds } elseif ($row.UserId) { $row.UserId } else { $null }) + Operations = if ($row.Operations) { $row.Operations } elseif ($row.Operation) { $row.Operation } else { $null } + ResultStatus = $(try { $row.ResultStatus } catch { '' }) + ResultCount = 0 + Identity = $identity + IsValid = $true + ObjectState = '' + AuditData = $row.AuditData + Operation = if ($row.Operation) { $row.Operation } elseif ($row.Operations) { $row.Operations } else { $null } + UserId = if 
($row.UserId) { $row.UserId } elseif ($row.UserIds) { $row.UserIds } else { $null } + } + [void]$allLogs.Add($rec) + } catch {} + } + } + catch { } + } + # For OnlyUserInfo mode, skip the early return - we don't need audit logs, just Entra data + # For streaming merge mode, $allLogs is intentionally empty - data will be streamed from JSONL files + if ($allLogs.Count -eq 0 -and -not $OnlyUserInfo -and -not $script:UseStreamingMergeForExport) { + Write-LogHost ""; Write-LogHost "No audit logs found in the specified date range for the selected activity types." -ForegroundColor Yellow + # Warn if token refresh occurred during monitoring — 0 records may indicate auth-related data loss + if ($script:SharedAuthState.RefreshCount -gt 0) { + Write-LogHost "" + Write-LogHost " [!] WARNING: Token refresh occurred during this run ($($script:SharedAuthState.RefreshCount) refresh(es))." -ForegroundColor Red + Write-LogHost " Zero records with auth recovery suggests credentials may have expired" -ForegroundColor Red + Write-LogHost " while queries were being polled. Check the log file for 401 errors." -ForegroundColor Red + Write-LogHost " If the machine was suspended/sleeping, this is the likely cause." -ForegroundColor Red + Write-LogHost " Recommendation: Re-run the script on a machine that will not sleep." -ForegroundColor Yellow + Write-LogHost "" + } + Write-LogHost "Emitting header-only CSV (0 rows) for deterministic downstream processing..." 
-ForegroundColor Cyan + $headerColumns = if ($ExplodeDeep -or $ExplodeArrays -or $ForcedRawInputCsvExplosion) { if ($IncludeM365Usage -and $RAWInputCSV) { Get-M365UsageWideHeader -RawCsvPath $RAWInputCSV -BaseHeader $M365UsageBaseHeader } else { $PurviewExplodedHeader } } else { @('RecordId', 'CreationDate', 'RecordType', 'Operation', 'UserId', 'AuditData', 'AssociatedAdminUnits', 'AssociatedAdminUnitsNames') } + try { $outputDirEmpty = Split-Path $OutputFile -Parent; if (-not (Test-Path $outputDirEmpty)) { New-Item -ItemType Directory -Path $outputDirEmpty -Force | Out-Null }; $enc = New-Object System.Text.UTF8Encoding($false); $sw = [System.IO.StreamWriter]::new($OutputFile, $false, $enc); $escapedCols = @(); foreach ($col in $headerColumns) { $c = [string]$col; $needsQuote = ($c -match '[",\r\n]') -or $c.StartsWith(' ') -or $c.EndsWith(' '); $escaped = $c -replace '"', '""'; if ($needsQuote) { $escaped = '"' + $escaped + '"' }; $escapedCols += , $escaped }; $sw.WriteLine(($escapedCols -join ',')); $sw.Flush(); $sw.Dispose() } catch { Write-LogHost "Failed to write header-only CSV: $($_.Exception.Message)" -ForegroundColor Red } + # Finalize checkpoint: rename _PARTIAL files and delete checkpoint (same pattern as normal completion) + if ($script:CheckpointEnabled -and $script:PartialOutputPath -and (Test-Path $script:PartialOutputPath)) { + Complete-CheckpointRun -FinalOutputPath $script:FinalOutputPath + $OutputFile = $script:FinalOutputPath + $LogFile = $script:LogFile + } + $script:metrics.TotalStructuredRows = 0; $script:metrics.EffectiveChunkSize = 0; Set-ProgressPhase -Phase 'Complete' -Status 'No data'; Complete-Progress; Write-LogHost "Header-only CSV created at: $OutputFile" -ForegroundColor Green; $script:ScriptCompleted = $true; return + } + + # Determine explosion mode: + # - Full Explosion: ExplodeDeep, ExplodeArrays, or ForcedRawInputCsvExplosion + # - No Explosion: Standard 1:1 format + $fullExplode = ($ExplodeDeep -or $ExplodeArrays -or 
$ForcedRawInputCsvExplosion) + $partialExplode = $false + $effectiveExplode = $fullExplode + + $processingMode = if ($ExplodeDeep) { "deep column flattening (with row explosion)" } elseif ($fullExplode) { "array explosion (full)" } else { "standard 1:1 format" } + Write-LogHost "Converting audit records to structured format using $processingMode..." -ForegroundColor Yellow + $structuredDataCount = 0 + + # Parallel explosion configuration (PS7+ only, controlled by -ExplosionThreads) + # ExplosionThreads: 0=auto (optimized based on CPU, memory), 1=force serial, 2-32=explicit + $actualExplosionThreads = if ($ExplosionThreads -eq 0) { + # Auto-optimization: explosion is memory-bound, not CPU-bound + # 8 threads provides good parallelism without excessive GC pressure + $baseThreads = [Math]::Max(2, [Math]::Min([Environment]::ProcessorCount, 8)) + + # Memory pressure adjustment: reduce threads if system is under memory pressure + $memoryAdjustedThreads = $baseThreads + try { + $workingSetMB = [int]([System.Diagnostics.Process]::GetCurrentProcess().WorkingSet64 / 1MB) + $availableMemory = $null + try { + $osInfo = Get-CimInstance -ClassName Win32_OperatingSystem -ErrorAction SilentlyContinue + if ($osInfo) { $availableMemory = [int]($osInfo.FreePhysicalMemory / 1KB) } # Convert KB to MB + } catch { } + + if ($availableMemory -and $availableMemory -lt 2000) { + # Less than 2GB free: reduce to 2-4 threads + $memoryAdjustedThreads = [Math]::Max(2, [Math]::Min(4, $baseThreads)) + Write-LogHost " [THREADS] Memory pressure detected ($availableMemory MB free) - reducing threads: $baseThreads -> $memoryAdjustedThreads" -ForegroundColor Yellow + } + elseif ($availableMemory -and $availableMemory -lt 4000) { + # Less than 4GB free: reduce by 25% + $memoryAdjustedThreads = [Math]::Max(2, [int]($baseThreads * 0.75)) + if ($memoryAdjustedThreads -lt $baseThreads) { + Write-LogHost " [THREADS] Low memory ($availableMemory MB free) - reducing threads: $baseThreads -> 
$memoryAdjustedThreads" -ForegroundColor Yellow + } + } + } catch { } + + $memoryAdjustedThreads + } elseif ($ExplosionThreads -eq 1) { + 1 # Force serial + } else { + $ExplosionThreads # User-specified value (already validated 2-32) + } + + # Compute effective record count accounting for streaming merge (where $allLogs is empty, records are in JSONL) + $effectiveRecordCount = if ($script:UseStreamingMergeForExport) { + $sc = if ($script:StreamingMergeRecordCount -gt 0) { $script:StreamingMergeRecordCount } elseif ($mergedFromIncremental -gt 0) { $mergedFromIncremental } else { 0 } + $allLogs.Count + $sc + } else { + $allLogs.Count + } + + # Enable parallel only if PS7+, more than 500 records, AND threads > 1 + $useParallelExplosion = $script:IsPS7 -and ($effectiveRecordCount -gt 500) -and ($actualExplosionThreads -gt 1) + $parallelBatchSize = 1000 # Records per parallel batch + $parallelThrottleLimit = $actualExplosionThreads + + if ($useParallelExplosion) { + Write-LogHost "Parallel processing: ENABLED (PS7+ detected, $parallelThrottleLimit threads)" -ForegroundColor Green + } else { + if (-not $script:IsPS7) { + Write-LogHost "Parallel processing: DISABLED (requires PowerShell 7+)" -ForegroundColor Gray + } elseif ($ExplosionThreads -eq 1) { + Write-LogHost "Parallel processing: DISABLED (forced serial via -ExplosionThreads 1)" -ForegroundColor Gray + } elseif ($effectiveRecordCount -le 500) { + Write-LogHost "Parallel processing: DISABLED (only $effectiveRecordCount records - threshold is 500)" -ForegroundColor Gray + } + } + + # Schema discovery note: Parallel mode scans ALL rows for 100% column coverage; serial mode uses sampling + if ($useParallelExplosion) { + Write-LogHost "Streaming export mode enabled (parallel mode: full schema scan; base chunk size=$StreamingChunkSize)" -ForegroundColor Yellow + } else { + Write-LogHost "Streaming export mode enabled (schema sample=$StreamingSchemaSample; base chunk size=$StreamingChunkSize)" -ForegroundColor Yellow + 
} + $te0 = Get-Date + $schemaFrozen = $false; $schemaSampleRows = New-Object System.Collections.Generic.List[object]; $postFreezeNewColumns = 0; $lateIgnoredColumns = New-Object System.Collections.Generic.HashSet[string]; $columnOrder = $null; $buffer = New-Object System.Collections.Generic.List[object]; $exportTemp = Join-Path ([System.IO.Path]::GetTempPath()) ("pax_export_" + [guid]::NewGuid().ToString() + ".tmp"); $csvWriter = $false + + # Ensure TotalRecordsFetched reflects allLogs count (may not have been set in all code paths) + if ($script:metrics.TotalRecordsFetched -eq 0 -and $allLogs.Count -gt 0) { + $script:metrics.TotalRecordsFetched = $allLogs.Count + } + + # ═══════════════════════════════════════════════════════════════════════════════ + # NON-EXPLOSION FAST PATH: Direct stream to CSV with fixed schema (skip parallel overhead) + # ═══════════════════════════════════════════════════════════════════════════════ + if (-not $fullExplode) { + + # STREAMING MERGE PATH - Handle large resume scenarios without loading into memory + if ($script:UseStreamingMergeForExport) { + Write-LogHost "Non-explosion fast path: STREAMING MERGE MODE (memory-efficient)" -ForegroundColor Cyan + Write-LogHost " Streaming directly from incremental files to avoid memory exhaustion..." -ForegroundColor DarkGray + $fastPathStart = Get-Date + $fastPathColumns = @('RecordId', 'CreationDate', 'RecordType', 'Operation', 'UserId', 'AuditData', 'AssociatedAdminUnits', 'AssociatedAdminUnitsNames') + + # First, write any in-memory records from THIS run's partitions (if any) + $inMemoryCount = $allLogs.Count + $streamingActivityCounts = @{} + if ($inMemoryCount -gt 0) { + Write-LogHost " Writing $($inMemoryCount.ToString('N0')) in-memory records from current run..." 
-ForegroundColor DarkCyan + Open-CsvWriter -Path $exportTemp -Columns $fastPathColumns + $csvWriter = $true + + $batch = New-Object System.Collections.Generic.List[object] + $inMemoryRecordIds = New-Object System.Collections.Generic.HashSet[string] + $batchSize = 5000 + foreach ($log in $allLogs) { + $auditData = $log.AuditData + $parsedAudit = if ($log.PSObject.Properties['_ParsedAuditData']) { $log._ParsedAuditData } else { try { $auditData | ConvertFrom-Json -ErrorAction SilentlyContinue } catch { $null } } + $opValue = if ($parsedAudit -and $parsedAudit.Operation) { $parsedAudit.Operation } else { $log.Operations } + + # Track per-activity counts for Activity Type Breakdown + if ($opValue) { + if (-not $streamingActivityCounts.ContainsKey($opValue)) { $streamingActivityCounts[$opValue] = 0 } + $streamingActivityCounts[$opValue]++ + } + + $fastRecord = [pscustomobject]@{ + RecordId = if ($log.RecordId) { $log.RecordId } elseif ($log.Identity) { $log.Identity } elseif ($log.Id) { $log.Id } elseif ($parsedAudit -and $parsedAudit.Id) { $parsedAudit.Id } else { $null } + CreationDate = if ($log.CreationDate) { $log.CreationDate.ToUniversalTime().ToString('yyyy-MM-ddTHH:mm:ss.fffZ') } else { '' } + RecordType = $log.RecordType + Operation = $opValue + UserId = if ($log.UserId) { $log.UserId } elseif ($log.UserIds) { $log.UserIds } else { '' } + AuditData = $auditData + AssociatedAdminUnits = $(try { if ($parsedAudit.AssociatedAdminUnits) { $parsedAudit.AssociatedAdminUnits } elseif ($log.AssociatedAdminUnits) { $log.AssociatedAdminUnits } else { '' } } catch { '' }) + AssociatedAdminUnitsNames = $(try { if ($parsedAudit.AssociatedAdminUnitsNames) { $parsedAudit.AssociatedAdminUnitsNames } elseif ($log.AssociatedAdminUnitsNames) { $log.AssociatedAdminUnitsNames } else { '' } } catch { '' }) + } + $batch.Add($fastRecord) + if ($fastRecord.RecordId) { [void]$inMemoryRecordIds.Add($fastRecord.RecordId) } + + if ($batch.Count -ge $batchSize) { + Write-CsvRows -Rows $batch 
-Columns $fastPathColumns + $batch.Clear() + } + } + if ($batch.Count -gt 0) { + Write-CsvRows -Rows $batch -Columns $fastPathColumns + $batch.Clear() + } + Close-CsvWriter + $csvWriter = $false + + # Clear in-memory collection to free RAM before streaming merge + $allLogs.Clear() + [GC]::Collect() + Write-LogHost " In-memory records written, RAM freed" -ForegroundColor DarkGray + } + + # Validate JSONL file count vs expected partition count before streaming merge + # Detect data loss early — if fewer files exist than completed partitions, warn and mark output as PARTIAL + $validationIncrDir = Join-Path $script:StreamingMergeDirectory ".pax_incremental" + $expectedPartitionCount = @($script:StreamingMergePartitions).Count + $actualJsonlFiles = @() + if (Test-Path $validationIncrDir) { + $actualJsonlFiles = @(Get-ChildItem -Path $validationIncrDir -Filter "*_${global:ScriptRunTimestamp}_*.jsonl" -ErrorAction SilentlyContinue) + } + $actualFileCount = $actualJsonlFiles.Count + if ($actualFileCount -lt $expectedPartitionCount) { + # Identify which partition numbers have files vs which are missing + $partitionsWithFiles = @($actualJsonlFiles | ForEach-Object { if ($_.Name -match '^Part(\d+)_') { [int]$Matches[1] } } | Sort-Object -Unique) + $missingPartitions = @($script:StreamingMergePartitions | Where-Object { $_ -notin $partitionsWithFiles } | Sort-Object) + Write-LogHost "" -ForegroundColor Yellow + Write-LogHost " [DATA-LOSS] WARNING: Only $actualFileCount of $expectedPartitionCount partition JSONL files found for this run" -ForegroundColor Yellow + Write-LogHost " [DATA-LOSS] Missing partition data: $($missingPartitions -join ', ')" -ForegroundColor Yellow + Write-LogHost " [DATA-LOSS] Output will be marked as PARTIAL due to incomplete data" -ForegroundColor Yellow + Write-LogHost "" -ForegroundColor Yellow + # Mark output as PARTIAL by updating the output file name + $script:StreamingMergeDataLoss = $true + $script:StreamingMergeMissingPartitions = 
$missingPartitions + } else { + $script:StreamingMergeDataLoss = $false + } + + # Now stream merge the previously-completed partitions directly to CSV + Write-LogHost " Streaming merge of $($script:StreamingMergePartitions.Count) previously-completed partitions..." -ForegroundColor Cyan + + # If we already wrote in-memory records, we need to append to the temp file + if ($inMemoryCount -gt 0) { + # Streaming merge needs to append to existing file - use a second temp file then combine + $streamingTemp = Join-Path ([System.IO.Path]::GetTempPath()) ("pax_streaming_" + [guid]::NewGuid().ToString() + ".tmp") + $streamedCount = Merge-IncrementalSaves-Streaming -OutputFile $streamingTemp -OutputDirectory $script:StreamingMergeDirectory -OnlyPartitionIndices $script:StreamingMergePartitions -Columns $fastPathColumns -ExcludeRecordIds $inMemoryRecordIds -ActivityCounts ([ref]$streamingActivityCounts) -RunTimestamp $global:ScriptRunTimestamp + + # Append streaming temp to main temp (skip header line from streaming file) + if ((Test-Path $streamingTemp) -and $streamedCount -gt 0) { + Write-LogHost " Combining in-memory and streamed data..." 
-ForegroundColor DarkGray + Get-Content $streamingTemp | Select-Object -Skip 1 | Add-Content $exportTemp + Remove-Item $streamingTemp -Force -ErrorAction SilentlyContinue + } + $totalStreamedRecords = $inMemoryCount + $streamedCount + } else { + # No in-memory records - stream directly to final temp file + $totalStreamedRecords = Merge-IncrementalSaves-Streaming -OutputFile $exportTemp -OutputDirectory $script:StreamingMergeDirectory -OnlyPartitionIndices $script:StreamingMergePartitions -Columns $fastPathColumns -ActivityCounts ([ref]$streamingActivityCounts) -RunTimestamp $global:ScriptRunTimestamp + } + + # If data loss detected, rename output to include _PARTIAL suffix so users know data is incomplete + if ($script:StreamingMergeDataLoss) { + $outDir = Split-Path $OutputFile -Parent + $outBase = [System.IO.Path]::GetFileNameWithoutExtension($OutputFile) + $outExt = [System.IO.Path]::GetExtension($OutputFile) + $OutputFile = Join-Path $outDir "${outBase}_PARTIAL${outExt}" + Write-LogHost " [DATA-LOSS] Output file renamed to: $(Split-Path $OutputFile -Leaf)" -ForegroundColor Yellow + Write-LogHost " [DATA-LOSS] Missing partitions: $($script:StreamingMergeMissingPartitions -join ', ')" -ForegroundColor Yellow + } + + # Move temp file to final output + if (Test-Path $exportTemp) { + Move-Item -Path $exportTemp -Destination $OutputFile -Force + } + + $fastPathElapsed = (Get-Date) - $fastPathStart + $fastPathRate = if ($fastPathElapsed.TotalSeconds -gt 0) { [int]($totalStreamedRecords / $fastPathElapsed.TotalSeconds) } else { 0 } + Write-LogHost "Streaming merge export complete: $($totalStreamedRecords.ToString('N0')) records in $([Math]::Round($fastPathElapsed.TotalSeconds, 1))s ($fastPathRate rec/sec)" -ForegroundColor Green + + # Update metrics + $script:metrics.TotalStructuredRows = $totalStreamedRecords + $structuredDataCount = $totalStreamedRecords + $processedRecordCount = $totalStreamedRecords + $columnOrder = $fastPathColumns + $schemaFrozen = $true + + # 
Update Retrieved counter to reflect actual streaming merge data + # actual = exported unique records + duplicates removed (more accurate than filename-based estimate) + $actualRetrievedFromMerge = $totalStreamedRecords + [int]$script:StreamingMergeDuplicatesSkipped + if ($actualRetrievedFromMerge -gt 0) { + $script:metrics.TotalRecordsFetched = $actualRetrievedFromMerge + } + + # Populate per-activity metrics from actual streaming counts (inline handlers don't track these reliably) + $script:metrics.Activities = @{} + foreach ($opKey in $streamingActivityCounts.Keys) { + $script:metrics.Activities[$opKey] = @{ Retrieved = $streamingActivityCounts[$opKey]; Structured = $streamingActivityCounts[$opKey] } + } + + # Store original count for ratio comparisons (allLogs was cleared for RAM) + $script:OriginalInputRecordCount = $totalStreamedRecords + + # Skip the normal fast path processing below + $script:StreamingMergeCompleted = $true + } + # END STREAMING MERGE PATH + + # Standard non-explosion fast path (original code, only runs if NOT using streaming merge) + if (-not $script:UseStreamingMergeForExport) { + Write-LogHost "Non-explosion fast path: Direct streaming with fixed 8-column schema..." 
-ForegroundColor Cyan + $fastPathStart = Get-Date + + # Fixed schema for non-explosion mode (column names match Purview UI export) + $fastPathColumns = @('RecordId', 'CreationDate', 'RecordType', 'Operation', 'UserId', 'AuditData', 'AssociatedAdminUnits', 'AssociatedAdminUnitsNames') + + # Open CSV with known schema immediately + Open-CsvWriter -Path $exportTemp -Columns $fastPathColumns + $csvWriter = $true + + # Stream directly - minimal transformation + $batchSize = 5000 + $batch = New-Object System.Collections.Generic.List[object] + $processedCount = 0 + $lastProgressTime = Get-Date + + # Track per-activity export counts for metrics (fast path) + $fastPathActivityCounts = @{} + + foreach ($log in $allLogs) { + $processedCount++ + + # Minimal record creation (no full Convert-ToStructuredRecord overhead) + $auditData = $log.AuditData + $parsedAudit = if ($log.PSObject.Properties['_ParsedAuditData']) { $log._ParsedAuditData } else { try { $auditData | ConvertFrom-Json -ErrorAction SilentlyContinue } catch { $null } } + $opValue = if ($parsedAudit -and $parsedAudit.Operation) { $parsedAudit.Operation } else { $log.Operations } + + # Track per-activity counts + $opKey = if ($opValue) { [string]$opValue } else { 'Unknown' } + if (-not $fastPathActivityCounts.ContainsKey($opKey)) { $fastPathActivityCounts[$opKey] = 0 } + $fastPathActivityCounts[$opKey]++ + + $fastRecord = [pscustomobject]@{ + RecordId = if ($log.RecordId) { $log.RecordId } elseif ($log.Identity) { $log.Identity } elseif ($log.Id) { $log.Id } elseif ($parsedAudit -and $parsedAudit.Id) { $parsedAudit.Id } else { $null } + CreationDate = if ($log.CreationDate) { $log.CreationDate.ToUniversalTime().ToString('yyyy-MM-ddTHH:mm:ss.fffZ') } else { '' } + RecordType = $log.RecordType + Operation = $opValue + UserId = if ($log.UserId) { $log.UserId } elseif ($log.UserIds) { $log.UserIds } else { '' } + AuditData = $auditData + AssociatedAdminUnits = $(try { if ($parsedAudit.AssociatedAdminUnits) { 
$parsedAudit.AssociatedAdminUnits } elseif ($log.AssociatedAdminUnits) { $log.AssociatedAdminUnits } else { '' } } catch { '' }) + AssociatedAdminUnitsNames = $(try { if ($parsedAudit.AssociatedAdminUnitsNames) { $parsedAudit.AssociatedAdminUnitsNames } elseif ($log.AssociatedAdminUnitsNames) { $log.AssociatedAdminUnitsNames } else { '' } } catch { '' }) + } + + $batch.Add($fastRecord) + + if ($batch.Count -ge $batchSize) { + Write-CsvRows -Rows $batch -Columns $fastPathColumns + $batch.Clear() + + # Progress every 60 seconds + if (((Get-Date) - $lastProgressTime).TotalSeconds -ge 60) { + $elapsed = (Get-Date) - $fastPathStart + $rate = [int]($processedCount / $elapsed.TotalSeconds) + $pct = [int](($processedCount / $allLogs.Count) * 100) + Write-LogHost ("[{0}] Fast path: {1:N0}/{2:N0} ({3}%) | ~{4:N0} rec/sec" -f (Get-Date -Format "HH:mm:ss"), $processedCount, $allLogs.Count, $pct, $rate) -ForegroundColor DarkCyan + $lastProgressTime = Get-Date + } + } + } + + # Flush remaining batch + if ($batch.Count -gt 0) { + Write-CsvRows -Rows $batch -Columns $fastPathColumns + $batch.Clear() + } + + Close-CsvWriter + $csvWriter = $false + + # Move temp file to final output + if (Test-Path $exportTemp) { + Move-Item -Path $exportTemp -Destination $OutputFile -Force + } + + $fastPathElapsed = (Get-Date) - $fastPathStart + $fastPathRate = if ($fastPathElapsed.TotalSeconds -gt 0) { [int]($allLogs.Count / $fastPathElapsed.TotalSeconds) } else { 0 } + Write-LogHost "Non-explosion fast path complete: $($allLogs.Count) records in $([Math]::Round($fastPathElapsed.TotalSeconds, 1))s ($fastPathRate rec/sec)" -ForegroundColor Green + + # Update metrics + $script:metrics.TotalStructuredRows = $allLogs.Count + $structuredDataCount = $allLogs.Count + $processedRecordCount = $allLogs.Count + $columnOrder = $fastPathColumns + $schemaFrozen = $true + + # Merge fast path activity counts into script:metrics.Activities (for Activity Type Breakdown) + # In fast path (non-explosion), Retrieved 
and Structured are equal (1:1 mapping) + foreach ($opKey in $fastPathActivityCounts.Keys) { + if (-not $script:metrics.Activities.ContainsKey($opKey)) { + $script:metrics.Activities[$opKey] = @{ Retrieved = 0; Structured = 0 } + } + $script:metrics.Activities[$opKey].Retrieved += $fastPathActivityCounts[$opKey] + $script:metrics.Activities[$opKey].Structured += $fastPathActivityCounts[$opKey] + } + + # Skip to post-processing (bypass parallel and serial paths) + $skipToPostProcessing = $true + } # End of standard non-explosion fast path (if not using streaming merge) + + # If streaming merge was used, also skip to post-processing + if ($script:UseStreamingMergeForExport) { + $skipToPostProcessing = $true + } + } else { + $skipToPostProcessing = $false + } + + # ═══════════════════════════════════════════════════════════════════════════════ + # PARALLEL MODE: Skip serial loop entirely - do parallel schema discovery + processing + # ═══════════════════════════════════════════════════════════════════════════════ + if (-not $skipToPostProcessing -and $useParallelExplosion) { + # Jump to parallel processing block (after the serial foreach loop) + $processedRecordCount = 0 + $schemaFrozen = $false + # Fall through to parallel block below (skip serial foreach) + } elseif (-not $skipToPostProcessing) { + # SERIAL MODE: Original foreach loop for PS5 compatibility + + # Explosion progress tracking (provides heartbeat for long-running explosion phase) + $explosionProgressInterval = 20000 + $processedRecordCount = 0 + $lastProgressUpdate = 0 + + foreach ($log in $allLogs) { + $processedRecordCount++ + + # NOTE: Retrieved counts are tracked during partition retrieval (lines 11052, 11431, 11784) + # Do NOT increment here - that would double-count records + + # Periodic progress update (every 20K records) + if ($processedRecordCount -ge ($lastProgressUpdate + $explosionProgressInterval)) { + $elapsed = (Get-Date) - $te0 + $rate = [int]($processedRecordCount / 
$elapsed.TotalSeconds)
        $pct = [int](($processedRecordCount / $allLogs.Count) * 100)
        # Floor the totals instead of casting with [int]: the cast rounds to nearest, so an
        # elapsed time of 1m 36s (TotalMinutes = 1.6) used to be reported as "2m 36s".
        $elapsedStr = if ($elapsed.TotalMinutes -ge 1) {
            "{0}m {1}s" -f ([int][Math]::Floor($elapsed.TotalMinutes)), $elapsed.Seconds
        } else {
            "{0}s" -f ([int][Math]::Floor($elapsed.TotalSeconds))
        }
        Write-LogHost ("[{0}] Processing: {1:N0} / {2:N0} records ({3}%) | Elapsed: {4} | Rate: {5:N0} rec/sec" -f (Get-Date -Format "HH:mm:ss"), $processedRecordCount, $allLogs.Count, $pct, $elapsedStr, $rate) -ForegroundColor DarkCyan
        $lastProgressUpdate = $processedRecordCount
    }

    # Convert the raw audit record into structured rows: exploded rows when explosion is
    # in effect, otherwise a single structured record.
    $records = if ($effectiveExplode) {
        Convert-ToPurviewExplodedRecords -Record $log -Deep:$ExplodeDeep -PartialExplode:$partialExplode -PromptFilterValue $PromptFilter
    } else {
        Convert-ToStructuredRecord -Record $log -EnableExplosion:$false
    }
    $recordsArr = To-RecordArray $records
    if ($recordsArr.Count -gt 0) {
        # Metrics are best-effort: a failure here must never abort the export, hence the empty catch.
        try {
            $script:metrics.TotalStructuredRows += $recordsArr.Count
            $structuredDataCount += $recordsArr.Count

            # Attribute the produced rows to the record's operation ('Unknown' when neither
            # Operation nor Operations is populated).
            $opName = $null
            try { $opName = if ($log.Operation) { [string]$log.Operation } elseif ($log.Operations) { [string]$log.Operations } else { $null } } catch {}
            if (-not $opName) { $opName = 'Unknown' }
            if (-not $script:metrics.Activities.ContainsKey($opName)) { $script:metrics.Activities[$opName] = @{ Retrieved = 0; Structured = 0 } }
            $script:metrics.Activities[$opName].Structured += $recordsArr.Count
        } catch {}

        foreach ($r in $recordsArr) {
            if (-not $schemaFrozen) {
                # Schema not frozen yet: hold rows back as samples until enough have been
                # collected to decide the CSV column order.
                $schemaSampleRows.Add($r) | Out-Null
                if ($schemaSampleRows.Count -ge $StreamingSchemaSample) {
                    if ($ExplodeArrays -or $ExplodeDeep -or $ForcedRawInputCsvExplosion) {
                        # Explosion modes start from a known base header, then get augmented.
                        $columnOrder = New-Object System.Collections.Generic.List[string]
                        if ($IncludeM365Usage -and $RAWInputCSV) {
                            foreach ($c in (Get-M365UsageWideHeader -RawCsvPath $RAWInputCSV -BaseHeader $M365UsageBaseHeader)) { [void]$columnOrder.Add($c) }
                        } else {
                            foreach ($c in $PurviewExplodedHeader) { [void]$columnOrder.Add($c) }
                        }
                        if ($ExplodeDeep -and $script:DeepExtraColumns
-and $script:DeepExtraColumns.Count -gt 0) { + foreach ($c in $script:DeepExtraColumns) { if (-not $columnOrder.Contains($c)) { [void]$columnOrder.Add($c) } } + } + # Augment header with discovered columns (live or replay) + foreach ($sr in $schemaSampleRows) { foreach ($pn in $sr.PSObject.Properties.Name) { if (-not $columnOrder.Contains($pn)) { [void]$columnOrder.Add($pn) } } } + } + else { + $columnOrder = New-Object System.Collections.Generic.List[string] + foreach ($sr in $schemaSampleRows) { foreach ($pn in $sr.PSObject.Properties.Name) { if (-not $columnOrder.Contains($pn)) { [void]$columnOrder.Add($pn) } } } + } + Write-LogHost "Schema frozen with $($columnOrder.Count) columns after $($schemaSampleRows.Count) sample rows (serial mode - sample-based)" -ForegroundColor DarkCyan + $effectiveChunkSize = $StreamingChunkSize + $colCount = $columnOrder.Count + if ($colCount -gt 1000) { $effectiveChunkSize = [int][Math]::Min($effectiveChunkSize, 1000) } + elseif ($colCount -gt 750) { $effectiveChunkSize = [int][Math]::Min($effectiveChunkSize, 1500) } + elseif ($colCount -gt 500) { $effectiveChunkSize = [int][Math]::Min($effectiveChunkSize, 2500) } + elseif ($colCount -gt 250) { $effectiveChunkSize = [int][Math]::Min($effectiveChunkSize, 4000) } + else { if ($colCount -le 60 -and $StreamingChunkSize -lt 15000) { $autoBoost = [int][Math]::Min(15000, [Math]::Max($StreamingChunkSize * 3, 8000)); $effectiveChunkSize = $autoBoost } } + if ($effectiveChunkSize -ne $StreamingChunkSize) { Write-LogHost "Adaptive chunk size applied: $effectiveChunkSize (was $StreamingChunkSize) due to column width $colCount" -ForegroundColor DarkYellow } else { Write-LogHost "Chunk size retained/boosted: $effectiveChunkSize (columns=$colCount)" -ForegroundColor DarkGray } + $script:metrics.EffectiveChunkSize = $effectiveChunkSize + if (-not $csvWriter) { Open-CsvWriter -Path $exportTemp -Columns $columnOrder; $csvWriter = $true } + $emitRows = @(); foreach ($sr in $schemaSampleRows) { 
$emitRows += ($sr | Select-Object -Property $columnOrder) }; if ($emitRows.Count -gt 0) { Write-CsvRows -Rows $emitRows -Columns $columnOrder } + $schemaSampleRows.Clear(); $schemaFrozen = $true + } + } + else { + $rowHadNew = $false + foreach ($pn in $r.PSObject.Properties.Name) { if (-not $columnOrder.Contains($pn)) { if (-not $rowHadNew) { $postFreezeNewColumns++; $rowHadNew = $true }; if (-not $lateIgnoredColumns.Contains($pn)) { [void]$lateIgnoredColumns.Add($pn) } } } + $buffer.Add($r) | Out-Null + if (-not $effectiveChunkSize) { $effectiveChunkSize = $StreamingChunkSize } + if ($buffer.Count -ge $effectiveChunkSize) { $emitSet = $buffer | ForEach-Object { $_ | Select-Object -Property $columnOrder }; if (-not $csvWriter) { Open-CsvWriter -Path $exportTemp -Columns $columnOrder; $csvWriter = $true }; if ($emitSet.Count -gt 0) { Write-CsvRows -Rows $emitSet -Columns $columnOrder }; $buffer.Clear() } + } + } + } + else { + if (-not $script:loggedZeroRecords) { $script:loggedZeroRecords = 0 } + if ($script:loggedZeroRecords -lt 5) { + $opName = $null; try { $opName = if ($log.Operation) { [string]$log.Operation } elseif ($log.Operations) { [string]$log.Operations } else { $null } } catch {} + Write-LogHost "Record produced 0 structured rows (Operation=$opName)" -ForegroundColor DarkYellow + $script:loggedZeroRecords++ + } + } + } + } # End of SERIAL MODE else block + + # ═══════════════════════════════════════════════════════════════════════════════ + # PARALLEL EXPLOSION PROCESSING (PS7+ only) + # Uses Start-ThreadJob with job queue pattern for optimal load balancing + # Many small chunks + N concurrent workers = better CPU utilization + # ═══════════════════════════════════════════════════════════════════════════════ + if (-not $skipToPostProcessing -and $useParallelExplosion) { + $processingMode = if ($ExplodeArrays -or $ExplodeDeep) { "array explosion" } else { "record conversion" } + Write-LogHost "Starting parallel $processingMode of $($allLogs.Count) 
records using $parallelThrottleLimit concurrent threads..." -ForegroundColor Cyan + + # Update checkpoint to track explosion phase + if ($script:CheckpointEnabled -and $script:CheckpointData) { + $script:CheckpointData.explosion.status = 'InProgress' + $script:CheckpointData.explosion.recordsProcessed = 0 + $script:CheckpointData.explosion.rowsGenerated = 0 + $script:CheckpointData.explosion.lastUpdateTime = (Get-Date).ToUniversalTime().ToString('o') + Save-CheckpointToDisk + } + + $parallelStartTime = Get-Date + + # ───────────────────────────────────────────────────────────────────────── + # PHASE 1: Split records into many small chunks for job queue + # Small chunks = better load balancing when work varies per record + # ───────────────────────────────────────────────────────────────────────── + $totalRecords = $allLogs.Count + # Use ~1000 records per chunk (sweet spot for overhead vs load balancing) + # Minimum chunks = 2x thread count to ensure good distribution + $targetChunkSize = 1000 + $minChunks = $parallelThrottleLimit * 2 + $chunkSize = [Math]::Min($targetChunkSize, [Math]::Ceiling($totalRecords / $minChunks)) + $chunkSize = [Math]::Max(100, $chunkSize) # At least 100 records per chunk + + $chunks = [System.Collections.Generic.List[object[]]]::new() + for ($i = 0; $i -lt $totalRecords; $i += $chunkSize) { + $endIdx = [Math]::Min($i + $chunkSize - 1, $totalRecords - 1) + $chunk = $allLogs[$i..$endIdx] + $chunks.Add($chunk) + } + + Write-LogHost "Split $totalRecords records into $($chunks.Count) chunks (~$chunkSize records each) for $parallelThrottleLimit workers" -ForegroundColor Cyan + + # ───────────────────────────────────────────────────────────────────────── + # PHASE 2: Job queue - run up to N concurrent jobs, queue the rest + # As each job completes, start another from the queue + # ───────────────────────────────────────────────────────────────────────── + Write-LogHost "Phase 1: Processing $($chunks.Count) chunks with $parallelThrottleLimit 
concurrent workers..." -ForegroundColor Cyan + $explosionStart = Get-Date + + # Capture parameters for thread jobs + $threadParams = @{ + ExplodeDeep = $ExplodeDeep + PartialExplode = $partialExplode + PromptFilter = $PromptFilter + EffectiveExplode = $effectiveExplode + } + + # Job queue management + $activeJobs = [System.Collections.Generic.List[object]]::new() + $completedResults = [System.Collections.Generic.List[object]]::new() + $chunkQueue = [System.Collections.Generic.Queue[object]]::new() + foreach ($chunk in $chunks) { $chunkQueue.Enqueue($chunk) } + + $totalChunks = $chunks.Count + $chunksStarted = 0 + $chunksCompleted = 0 + $failedChunks = 0 + $lastProgressTime = Get-Date + $progressInterval = [TimeSpan]::FromSeconds(60) # Update progress every 60 seconds + + # ───────────────────────────────────────────────────────────────────────── + # Build InitializationScript with all required function definitions + # Start-ThreadJob creates ISOLATED runspaces - functions are NOT inherited + # ───────────────────────────────────────────────────────────────────────── + $initScriptText = @' +# Thread-local helper functions (required for explosion) +function Test-ScalarValue { param($v) ($null -eq $v -or $v -is [string] -or $v -is [char] -or $v -is [bool] -or $v -is [int] -or $v -is [long] -or $v -is [double] -or $v -is [decimal] -or $v -is [float] -or $v -is [datetime] -or $v -is [guid]) } + +function Get-SafeProperty { param($obj, [string]$name) try { if ($null -ne $obj -and $obj.PSObject.Properties[$name]) { return $obj.($name) } } catch {}; return $null } + +function Select-FirstNonNull { param([object[]]$Values) foreach ($v in $Values) { if ($null -ne $v -and ('' -ne [string]$v)) { return $v } } return $null } + +function To-RecordArray { + param($records) + $result = @() + if ($null -eq $records) { return $result } + $isEnumerable = ($records -is [System.Collections.IEnumerable]) + $isScalarish = ($records -is [string] -or $records -is 
[System.Management.Automation.PSObject] -or $records -is [System.Management.Automation.PSCustomObject]) + if ($isEnumerable -and -not $isScalarish) { foreach ($r in $records) { $result += ,$r } } + else { $result += ,$records } + return $result +} + +function Find-AllArrays { + param($Data, [string]$Path = '', [int]$Depth = 0, [hashtable]$Arrays) + if ($null -eq $Data) { return @{} } + if (-not $Arrays) { $Arrays = @{} } + if ($Depth -gt 6) { return $Arrays } + $isArray = ($Data -is [System.Collections.IEnumerable] -and -not ($Data -is [string]) -and (($Data -is [System.Collections.IList]) -or $Data.GetType().IsArray)) + if ($isArray) { + $key = if ($Path) { $Path } else { 'root' } + if (-not $Arrays.ContainsKey($key)) { $Arrays[$key] = [pscustomobject]@{ Path = $Path; Data = $Data; Count = ($Data | Measure-Object).Count } } + } + $props = $null + if ($Data -is [System.Management.Automation.PSObject]) { $props = $Data.PSObject.Properties } + elseif ($Data -is [System.Collections.IDictionary]) { $props = $Data.GetEnumerator() } + if ($props) { + foreach ($p in $props) { + $name = if ($p -is [System.Collections.DictionaryEntry]) { $p.Key } else { $p.Name } + $val = if ($p -is [System.Collections.DictionaryEntry]) { $p.Value } else { $p.Value } + $childPath = if ($Path) { "$Path.$name" } else { $name } + Find-AllArrays -Data $val -Path $childPath -Depth ($Depth + 1) -Arrays $Arrays | Out-Null + } + } + return $Arrays +} + +function ConvertTo-FlatColumns { + param([object]$Node, [string]$Prefix = '', [int]$MaxDepth = 60) + $cols = @{} + function Recurse([object]$n, [string]$p, [int]$d) { + if ($d -gt $MaxDepth) { return } + if ($null -eq $n) { if ($p) { $cols[$p.TrimEnd('.')] = $null }; return } + if (Test-ScalarValue $n) { if ($p) { $cols[$p.TrimEnd('.')] = $n }; return } + if ($n -is [System.Collections.IEnumerable] -and -not ($n -is [string]) -and -not ($n -is [System.Collections.IDictionary])) { + # Smart array handling: single-element arrays recurse without index, 
multi-element become JSON + $arr = @($n) + if ($arr.Count -eq 1) { + Recurse -n $arr[0] -p $p -d ($d + 1) + } elseif ($arr.Count -gt 1) { + if ($p) { try { $cols[$p.TrimEnd('.')] = ($n | ConvertTo-Json -Depth 10 -Compress -ErrorAction SilentlyContinue) } catch { $cols[$p.TrimEnd('.')] = '' } } + } else { + if ($p) { $cols[$p.TrimEnd('.')] = '' } + } + return + } + $props = $null; try { $props = $n.PSObject.Properties } catch {} + if ($props) { foreach ($prop in $props) { $name = [string]$prop.Name; $child = $prop.Value; $cp = if ($p) { $p + $name + '.' } else { $name + '.' }; Recurse -n $child -p $cp -d ($d + 1) } } + } + Recurse -n $Node -p $Prefix -d 0 + return $cols +} + +function Profile-AuditData { param([object]$AuditData) } # No-op stub for threads +'@ + + # Extract Convert-ToPurviewExplodedRecords and Convert-ToStructuredRecord function definitions + $funcConvertExploded = (Get-Command Convert-ToPurviewExplodedRecords -ErrorAction SilentlyContinue).Definition + $funcConvertStructured = (Get-Command Convert-ToStructuredRecord -ErrorAction SilentlyContinue).Definition + + # Build complete initialization script with all functions + $fullInitScript = $initScriptText + "`n`n" + ` + "function Convert-ToPurviewExplodedRecords {`n$funcConvertExploded`n}`n`n" + ` + "function Convert-ToStructuredRecord {`n$funcConvertStructured`n}" + + $initScriptBlock = [scriptblock]::Create($fullInitScript) + + # Capture additional variables needed in threads + $threadVars = @{ + PurviewExplodedHeader = $PurviewExplodedHeader + FlatDepthStandard = $FlatDepthStandard + FlatDepthDeep = $FlatDepthDeep + JsonDepth = $JsonDepth + ExplosionPerRecordRowCap = $ExplosionPerRecordRowCap + } + + # Helper to start a new job + $startNextJob = { + if ($chunkQueue.Count -gt 0) { + $chunk = $chunkQueue.Dequeue() + $chunksStarted++ + $job = Start-ThreadJob -Name "PAX_Chunk_$chunksStarted" -InitializationScript $initScriptBlock -ScriptBlock { + param($Records, $Params, $Vars) + + # Initialize 
thread-local script-scoped variables and helpers
            # The values below are re-created inside every thread job and seeded from $Vars.
            $script:metrics = @{
                FilteringSkippedRecords   = 0
                FilteringMissingAuditData = 0
                FilteringPromptFiltered   = 0
                FilteringParseFailures    = 0
                ExplosionEvents           = 0
                ExplosionRowsFromEvents   = 0
                ExplosionMaxPerRecord     = 0
                ExplosionTruncated        = $false
            }
            $script:DeepExtraColumns = New-Object System.Collections.Generic.List[string]
            $script:RegexTrueFalse = [regex]::new('^(?i:true|false)$', [System.Text.RegularExpressions.RegexOptions]::Compiled)
            $script:RegexYes1 = [regex]::new('^(?i:yes|1)$', [System.Text.RegularExpressions.RegexOptions]::Compiled)
            $script:RegexNo0 = [regex]::new('^(?i:no|0)$', [System.Text.RegularExpressions.RegexOptions]::Compiled)
            $PurviewExplodedHeader = $Vars.PurviewExplodedHeader
            $FlatDepthStandard = $Vars.FlatDepthStandard
            $FlatDepthDeep = $Vars.FlatDepthDeep
            $JsonDepth = $Vars.JsonDepth
            $ExplosionPerRecordRowCap = $Vars.ExplosionPerRecordRowCap

            # Thread-local script-scoped helper functions
            # Parse-DateSafe: Culture-invariant date parsing for Purview API dates.
            # Returns a [datetime], or $null when the input is blank or matches no known format.
            # Uses TryParseExact/TryParse rather than try/catch around ParseExact so that a
            # non-matching string does not cost one thrown exception per candidate format.
            function script:Parse-DateSafe {
                param([string]$dateStr)
                if ([string]::IsNullOrWhiteSpace($dateStr)) { return $null }
                $dateStr = $dateStr.Trim()
                $invariant = [System.Globalization.CultureInfo]::InvariantCulture
                $parsedDate = [datetime]::MinValue

                # Try ISO 8601 formats first (most common from Purview); formats are tried in order
                [string[]]$isoFormats = @(
                    'yyyy-MM-ddTHH:mm:ss.fffffffZ', 'yyyy-MM-ddTHH:mm:ss.ffffffZ', 'yyyy-MM-ddTHH:mm:ss.fffffZ',
                    'yyyy-MM-ddTHH:mm:ss.ffffZ', 'yyyy-MM-ddTHH:mm:ss.fffZ', 'yyyy-MM-ddTHH:mm:ss.ffZ',
                    'yyyy-MM-ddTHH:mm:ss.fZ', 'yyyy-MM-ddTHH:mm:ssZ', 'yyyy-MM-ddTHH:mm:ss',
                    'yyyy-MM-dd HH:mm:ss', 'yyyy-MM-dd'
                )
                if ([datetime]::TryParseExact($dateStr, $isoFormats, $invariant, [System.Globalization.DateTimeStyles]::AdjustToUniversal, [ref]$parsedDate)) {
                    return $parsedDate
                }

                # US date formats (MM/dd/yyyy as Purview returns)
                [string[]]$usFormats = @('M/d/yyyy h:mm:ss tt', 'M/d/yyyy HH:mm:ss', 'M/d/yyyy H:mm:ss', 'M/d/yyyy')
                if ([datetime]::TryParseExact($dateStr, $usFormats, $invariant, [System.Globalization.DateTimeStyles]::None, [ref]$parsedDate)) {
                    return $parsedDate
                }

                # Fallback to a general InvariantCulture parse
                if ([datetime]::TryParse($dateStr, $invariant, [System.Globalization.DateTimeStyles]::None, [ref]$parsedDate)) {
                    return $parsedDate
                }
                return $null
            }

            # Format-DatePurviewFast: renders a [datetime] (or a parseable date string) as
            # 'yyyy-MM-ddTHH:mm:ss.fffZ' after ToUniversalTime(); returns '' for empty/unparseable input.
            function script:Format-DatePurviewFast($dt) {
                if (-not $dt) { return '' }
                if ($dt -is [datetime]) { return $dt.ToUniversalTime().ToString('yyyy-MM-ddTHH:mm:ss.fffZ') }
                $parsed = script:Parse-DateSafe $dt
                if ($parsed) { return $parsed.ToUniversalTime().ToString('yyyy-MM-ddTHH:mm:ss.fffZ') }
                return ''
            }

            # BoolTFFast: normalises boolean-like values to 'TRUE'/'FALSE'; $null becomes '',
            # anything unrecognised is returned as its string form.
            function script:BoolTFFast($v) {
                if ($null -eq $v) { return '' }
                if ($v -is [bool]) { return $v.ToString().ToUpper() }
                $vStr = [string]$v
                if ($script:RegexTrueFalse.IsMatch($vStr)) { return $vStr.ToUpper() }
                if ($script:RegexYes1.IsMatch($vStr)) { return 'TRUE' }
                if ($script:RegexNo0.IsMatch($vStr)) { return 'FALSE' }
                return $vStr
            }

            # ToJsonIfObjectFast: scalars pass through unchanged, everything else is serialised
            # to compact JSON (falling back to [string] if serialisation throws).
            function script:ToJsonIfObjectFast($v) {
                if ($null -eq $v) { return '' }
                if (Test-ScalarValue $v) { return $v }
                try { return ($v | ConvertTo-Json -Depth $JsonDepth -Compress) } catch { return [string]$v }
            }

            # GetArrayFast: reads property $name from $parent and always hands back an array
            # (empty when the property is missing or $null). The original had two branches
            # that both returned @($val); they are collapsed here.
            function script:GetArrayFast($parent, [string]$name) {
                $val = Get-SafeProperty $parent $name
                if ($null -eq $val) { return @() }
                return @($val)
            }

            $results = [System.Collections.Generic.List[object]]::new()
            $recordCount = 0

            # Convert every record of this chunk; rows from all records are pooled in $results.
            foreach ($log in $Records) {
                $recordCount++
                try {
                    $records = if ($Params.EffectiveExplode) {
                        Convert-ToPurviewExplodedRecords -Record $log -Deep:$Params.ExplodeDeep -PartialExplode:$Params.PartialExplode -PromptFilterValue $Params.PromptFilter -SkipMetrics
                    } else {
                        Convert-ToStructuredRecord -Record $log -EnableExplosion:$false
                    }

                    $recordsArr = To-RecordArray $records
                    if ($recordsArr.Count -gt 0) {
                        foreach ($r in $recordsArr) {
                            $results.Add($r)
                        }
                    }
                } catch {
                    # 
Skip failed records
                }
            }

            # Hand the pooled rows and the number of source records seen back to the scheduler.
            @{
                Records     = $results.ToArray()
                RecordCount = $recordCount
            }
        } -ArgumentList @($chunk, $threadParams, $threadVars)

        $activeJobs.Add($job)
    }
}

# NOTE: $startNextJob is dot-sourced (". $startNextJob") rather than invoked with "&".
# The call operator runs the script block in a child scope, so its "$chunksStarted++" never
# reached this scope and every job ended up named "PAX_Chunk_1". Dot-sourcing runs it in the
# current scope; the only side effect is that $chunk/$job are (re)assigned here, and neither
# is read again in this block after the call.

# Start initial batch of jobs (up to thread limit)
for ($j = 0; $j -lt $parallelThrottleLimit -and $chunkQueue.Count -gt 0; $j++) {
    . $startNextJob
}

# Process job queue until all complete
while ($activeJobs.Count -gt 0 -or $chunkQueue.Count -gt 0) {
    # Check for finished jobs. 'Stopped' is included so that a stopped job is reaped (and
    # counted as failed below) instead of keeping this loop spinning forever.
    $justCompleted = @($activeJobs | Where-Object { $_.State -in @('Completed', 'Failed', 'Stopped') })

    foreach ($job in $justCompleted) {
        $activeJobs.Remove($job) | Out-Null
        $chunksCompleted++

        if ($job.State -eq 'Completed') {
            try {
                $result = Receive-Job -Job $job -ErrorAction Stop
                if ($result) {
                    $completedResults.Add($result)
                }
            } catch {
                $failedChunks++
            }
        } else {
            $failedChunks++
        }

        Remove-Job -Job $job -Force -ErrorAction SilentlyContinue

        # Start next job from queue (keeps up to $parallelThrottleLimit jobs in flight)
        . $startNextJob
    }

    # Progress update
    if (((Get-Date) - $lastProgressTime) -gt $progressInterval) {
        $elapsed = (Get-Date) - $explosionStart
        # Floor instead of [int] cast: the cast rounds to nearest, which turned 1m 36s into "2m 36s".
        $elapsedStr = if ($elapsed.TotalMinutes -ge 1) {
            "{0}m {1}s" -f ([int][Math]::Floor($elapsed.TotalMinutes)), $elapsed.Seconds
        } else {
            "{0}s" -f ([int][Math]::Floor($elapsed.TotalSeconds))
        }
        $pct = [int](($chunksCompleted / $totalChunks) * 100)
        $recordsProcessed = $chunksCompleted * $chunkSize  # Approximate
        $rate = if ($elapsed.TotalSeconds -gt 0) { [int]($recordsProcessed / $elapsed.TotalSeconds) } else { 0 }
        Write-LogHost ("[{0}] Chunks: {1}/{2} ({3}%) | Active: {4} | Queue: {5} | ~{6:N0} rec/sec | {7}" -f (Get-Date -Format "HH:mm:ss"), $chunksCompleted, $totalChunks, $pct, $activeJobs.Count, $chunkQueue.Count, $rate, $elapsedStr) -ForegroundColor DarkCyan
        $lastProgressTime = Get-Date
    }

    # Avoid a busy-wait while jobs are still running
    if ($activeJobs.Count -gt 0) {
        Start-Sleep -Milliseconds 100
    }
}

# Final 100% status line for Phase 1
$elapsed = (Get-Date) - $explosionStart
$elapsedStr = if ($elapsed.TotalMinutes -ge 1) {
    "{0}m {1}s" -f ([int][Math]::Floor($elapsed.TotalMinutes)), $elapsed.Seconds
} else {
    "{0}s" -f ([int][Math]::Floor($elapsed.TotalSeconds))
}
$finalRate = if ($elapsed.TotalSeconds -gt 0) { [int]($totalRecords / $elapsed.TotalSeconds) } else { 0 }
Write-LogHost ("[{0}] Chunks: {1}/{1} (100%) | Complete | ~{2:N0} rec/sec | {3}" -f (Get-Date -Format "HH:mm:ss"), $totalChunks, $finalRate, $elapsedStr) -ForegroundColor DarkCyan

$explosionElapsed = (Get-Date) - $explosionStart
$explosionRate = if ($explosionElapsed.TotalSeconds -gt 0) { [int]($totalRecords / $explosionElapsed.TotalSeconds) } else { 0 }

if ($failedChunks -gt 0) {
    Write-LogHost "WARNING: $failedChunks chunk(s) failed - some records may be missing" -ForegroundColor Yellow
}

# ─────────────────────────────────────────────────────────────────────────
# PHASE 3: Collect all results into single list
# (shown to the user as "Phase 2"; the log numbering runs one behind these section headers)
# ─────────────────────────────────────────────────────────────────────────
Write-LogHost "Phase 2: Consolidating results from $($completedResults.Count) chunks..."
-ForegroundColor Cyan + $allExplodedRecords = [System.Collections.Generic.List[object]]::new() + $totalSourceRecords = 0 + + foreach ($result in $completedResults) { + if ($result.Records) { + $totalSourceRecords += $result.RecordCount + foreach ($r in $result.Records) { + $allExplodedRecords.Add($r) + } + } + } + $completedResults.Clear() + + $phaseDesc = if ($ExplodeArrays -or $ExplodeDeep) { "Parallel explosion" } else { "Parallel conversion" } + Write-LogHost "$phaseDesc complete: $totalRecords records -> $($allExplodedRecords.Count) rows in $([Math]::Round($explosionElapsed.TotalSeconds, 1))s ($explosionRate rec/sec)" -ForegroundColor Green + + # Update checkpoint with explosion completion + if ($script:CheckpointEnabled -and $script:CheckpointData) { + $script:CheckpointData.explosion.status = 'Completed' + $script:CheckpointData.explosion.recordsProcessed = $totalRecords + $script:CheckpointData.explosion.rowsGenerated = $allExplodedRecords.Count + $script:CheckpointData.explosion.lastUpdateTime = (Get-Date).ToUniversalTime().ToString('o') + Save-CheckpointToDisk + } + + # ───────────────────────────────────────────────────────────────────────── + # PHASE 4: Apply schema and write CSV (must be serial for proper ordering) + # ───────────────────────────────────────────────────────────────────────── + Write-LogHost "Phase 3: Discovering schema from all $($allExplodedRecords.Count) rows (full scan)..." 
-ForegroundColor Cyan
+            $schemaStart = Get-Date
+
+            $structuredDataCount = $allExplodedRecords.Count
+            $script:metrics.TotalStructuredRows = $structuredDataCount
+            $processedRecordCount = $totalRecords
+
+            # Update per-activity Structured counts for Activity Type Breakdown display
+            # (Parallel explosion doesn't update these during processing - count now from consolidated results)
+            $activityStructuredCounts = @{}
+            foreach ($row in $allExplodedRecords) {
+                # Rows may be hashtables or objects; 'Operations' is the fallback when 'Operation' is empty.
+                $opName = if ($row -is [hashtable]) { if ($row['Operation']) { $row['Operation'] } else { $row['Operations'] } } else { if ($row.Operation) { $row.Operation } else { $row.Operations } }
+                if ($opName) {
+                    if (-not $activityStructuredCounts.ContainsKey($opName)) { $activityStructuredCounts[$opName] = 0 }
+                    $activityStructuredCounts[$opName]++
+                }
+            }
+            # Only activities already registered in the metrics table are updated.
+            foreach ($opName in $activityStructuredCounts.Keys) {
+                if ($script:metrics.Activities.ContainsKey($opName)) {
+                    $script:metrics.Activities[$opName].Structured = $activityStructuredCounts[$opName]
+                }
+            }
+
+            # Build schema by scanning ALL records (not just first N)
+            # This ensures 100% column discovery - only reads property names, not values (fast)
+            $columnOrder = New-Object System.Collections.Generic.List[string]
+            if ($ExplodeArrays -or $ExplodeDeep -or $ForcedRawInputCsvExplosion) {
+                # Seed with the fixed header so well-known columns keep their canonical order.
+                if ($IncludeM365Usage -and $RAWInputCSV) {
+                    foreach ($c in (Get-M365UsageWideHeader -RawCsvPath $RAWInputCSV -BaseHeader $M365UsageBaseHeader)) { [void]$columnOrder.Add($c) }
+                } else {
+                    foreach ($c in $PurviewExplodedHeader) { [void]$columnOrder.Add($c) }
+                }
+                if ($ExplodeDeep -and $script:DeepExtraColumns -and $script:DeepExtraColumns.Count -gt 0) {
+                    foreach ($c in $script:DeepExtraColumns) { if (-not $columnOrder.Contains($c)) { [void]$columnOrder.Add($c) } }
+                }
+            }
+            # Full scan: iterate ALL exploded records to discover every column.
+            # A HashSet mirrors $columnOrder so each membership test is O(1); List.Contains made the
+            # scan O(rows x columns^2) on wide schemas. The list still fixes the column order.
+            $knownColumns = [System.Collections.Generic.HashSet[string]]::new()
+            foreach ($c in $columnOrder) { [void]$knownColumns.Add($c) }
+            foreach ($sr in $allExplodedRecords) {
+                if ($null -eq $sr) { continue }
+                # Dictionary rows (hashtable / ordered) expose columns as keys; PSObject.Properties on
+                # them would return the .NET members (Keys, Values, Count, ...) instead of the data.
+                $propNames = if ($sr -is [System.Collections.IDictionary]) { $sr.Keys } else { $sr.PSObject.Properties.Name }
+                foreach ($pn in $propNames) {
+                    $colName = [string]$pn
+                    # HashSet.Add returns $true only for names not seen before.
+                    if ($knownColumns.Add($colName)) { [void]$columnOrder.Add($colName) }
+                }
+            }
+            $schemaElapsed = (Get-Date) - $schemaStart
+            Write-LogHost "Schema discovered: $($columnOrder.Count) columns from $($allExplodedRecords.Count) rows in $([Math]::Round($schemaElapsed.TotalSeconds, 1))s (full scan - 100% column coverage)" -ForegroundColor Cyan
+            $schemaSampleRows.Clear()
+            $schemaFrozen = $true
+
+            Write-LogHost "Phase 4: Writing CSV..." -ForegroundColor Cyan
+            $writeStart = Get-Date
+
+            # Calculate effective chunk size for CSV writing
+            # OPTIMIZATION: Increased base chunk sizes since Write-CsvRows now uses column index lookup (O(1) vs O(n))
+            $colCount = $columnOrder.Count
+            $effectiveChunkSize = $StreamingChunkSize
+            if ($colCount -gt 1000) { $effectiveChunkSize = [int][Math]::Min($effectiveChunkSize, 3000) }
+            elseif ($colCount -gt 500) { $effectiveChunkSize = [int][Math]::Min($effectiveChunkSize, 5000) }
+            elseif ($colCount -gt 250) { $effectiveChunkSize = [int][Math]::Min($effectiveChunkSize, 8000) }
+            elseif ($colCount -gt 100) { $effectiveChunkSize = [int][Math]::Max($StreamingChunkSize, 10000) } # New tier for 100-250 columns
+            else { $effectiveChunkSize = [int][Math]::Min(20000, [Math]::Max($StreamingChunkSize * 4, 15000)) } # Boost for ≤100 columns
+            $script:metrics.EffectiveChunkSize = $effectiveChunkSize
+
+            # Open CSV writer and write all records in chunks
+            Open-CsvWriter -Path $exportTemp -Columns $columnOrder
+            $csvWriter = $true
+
+            $writeChunkSize = $effectiveChunkSize
+            $totalWritten = 0
+            $lastWriteProgressTime = Get-Date
+            $writeProgressInterval = [TimeSpan]::FromSeconds(60)
+
+            for ($i = 0; $i -lt $allExplodedRecords.Count; $i += $writeChunkSize) {
+                # Inclusive end index, clamped to the last row.
+                $endIdx = [Math]::Min($i + $writeChunkSize - 1, $allExplodedRecords.Count - 1)
+                $chunk = $allExplodedRecords[$i..$endIdx]
+
+                # Rows from parallel explosion are already hashtables - pass directly to CSV writer
+                # Removed: Select-Object -Property
$columnOrder (extremely slow with 100+ columns)
+                if ($chunk.Count -gt 0) {
+                    Write-CsvRows -Rows $chunk -Columns $columnOrder
+                    $totalWritten += $chunk.Count
+                }
+
+                # Progress every 60 seconds
+                if (((Get-Date) - $lastWriteProgressTime) -gt $writeProgressInterval) {
+                    $writeElapsedSoFar = (Get-Date) - $writeStart
+                    # Floor, not [int] cast: the cast rounds to nearest and would overstate the minutes.
+                    $elapsedStr = if ($writeElapsedSoFar.TotalMinutes -ge 1) { "{0}m {1}s" -f [int][Math]::Floor($writeElapsedSoFar.TotalMinutes), $writeElapsedSoFar.Seconds } else { "{0}s" -f [int][Math]::Floor($writeElapsedSoFar.TotalSeconds) }
+                    $pct = [int](($totalWritten / $allExplodedRecords.Count) * 100)
+                    $rowRate = if ($writeElapsedSoFar.TotalSeconds -gt 0) { [int]($totalWritten / $writeElapsedSoFar.TotalSeconds) } else { 0 }
+                    Write-LogHost ("[{0}] CSV Write: {1:N0}/{2:N0} rows ({3}%) | {4} cols | ~{5:N0} rows/sec | {6}" -f (Get-Date -Format "HH:mm:ss"), $totalWritten, $allExplodedRecords.Count, $pct, $columnOrder.Count, $rowRate, $elapsedStr) -ForegroundColor DarkCyan
+                    $lastWriteProgressTime = Get-Date
+                }
+            }
+
+            # Final status line for the CSV write (reported to the user as "Phase 4")
+            $writeElapsed = (Get-Date) - $writeStart
+            $elapsedStr = if ($writeElapsed.TotalMinutes -ge 1) { "{0}m {1}s" -f [int][Math]::Floor($writeElapsed.TotalMinutes), $writeElapsed.Seconds } else { "{0}s" -f [int][Math]::Floor($writeElapsed.TotalSeconds) }
+            $finalRowRate = if ($writeElapsed.TotalSeconds -gt 0) { [int]($totalWritten / $writeElapsed.TotalSeconds) } else { 0 }
+            Write-LogHost ("[{0}] CSV Write: {1:N0}/{1:N0} rows (100%) | {2} cols | ~{3:N0} rows/sec | {4}" -f (Get-Date -Format "HH:mm:ss"), $totalWritten, $columnOrder.Count, $finalRowRate, $elapsedStr) -ForegroundColor Cyan
+            Write-LogHost "CSV write complete: $totalWritten rows in $([Math]::Round($writeElapsed.TotalSeconds, 1))s" -ForegroundColor Green
+
+            # Track activity metrics (approximate - we don't have per-record operation info in parallel mode)
+            # This is a known limitation - parallel mode won't have detailed per-operation breakdown
+
+            # Clean up to free memory
+            $allExplodedRecords.Clear()
+            $allExplodedRecords = $null
+            [System.GC]::Collect()
+
+            # End-to-end summary for the whole parallel path, measured from $parallelStartTime.
+            $parallelElapsed = (Get-Date) - $parallelStartTime
+            $parallelRate = if ($parallelElapsed.TotalSeconds -gt 0) { [int]($totalRecords / $parallelElapsed.TotalSeconds) } else { 0 }
+            $completeDesc = if ($ExplodeArrays -or $ExplodeDeep) { "PARALLEL EXPLOSION" } else { "PARALLEL PROCESSING" }
+            Write-LogHost "$completeDesc COMPLETE: $totalRecords records -> $structuredDataCount rows in $([Math]::Round($parallelElapsed.TotalSeconds, 1))s total ($parallelRate rec/sec)" -ForegroundColor Green
+        }
+        # END PARALLEL BLOCK
+
+        # Flush any remaining unfrozen schema samples (small datasets)
+        if (-not $schemaFrozen -and $schemaSampleRows.Count -gt 0) {
+            $columnOrder = New-Object System.Collections.Generic.List[string]
+            if ($ExplodeArrays -or $ExplodeDeep -or $ForcedRawInputCsvExplosion) {
+                # Exploded output starts from the fixed header (plus deep-extra columns when applicable).
+                if ($IncludeM365Usage -and $RAWInputCSV) {
+                    foreach ($c in (Get-M365UsageWideHeader -RawCsvPath $RAWInputCSV -BaseHeader $M365UsageBaseHeader)) { [void]$columnOrder.Add($c) }
+                } else {
+                    foreach ($c in $PurviewExplodedHeader) { [void]$columnOrder.Add($c) }
+                }
+                if ($ExplodeDeep -and $script:DeepExtraColumns -and $script:DeepExtraColumns.Count -gt 0) { foreach ($c in $script:DeepExtraColumns) { if (-not $columnOrder.Contains($c)) { [void]$columnOrder.Add($c) } } }
+            }
+            # Both modes then append any column seen in the sampled rows, in first-seen order
+            # (the non-exploded mode previously ran this identical loop twice).
+            foreach ($sr in $schemaSampleRows) { foreach ($pn in $sr.PSObject.Properties.Name) { if (-not $columnOrder.Contains($pn)) { [void]$columnOrder.Add($pn) } } }
+            Write-LogHost "Schema finalized with $($columnOrder.Count) columns from $($schemaSampleRows.Count) total rows (small dataset)" -ForegroundColor DarkCyan
+            if (-not $csvWriter) { Open-CsvWriter -Path $exportTemp -Columns $columnOrder; $csvWriter = $true }
+            $emitRows = @(); foreach ($sr in
$schemaSampleRows) { $emitRows += ($sr | Select-Object -Property $columnOrder) }; if ($emitRows.Count -gt 0) { Write-CsvRows -Rows $emitRows -Columns $columnOrder }
+            $schemaSampleRows.Clear(); $schemaFrozen = $true
+        }
+        # Flush any remaining buffered rows after schema freeze
+        if ($schemaFrozen -and $buffer.Count -gt 0) {
+            # @(...) forces an array: a single projected row is a lone PSCustomObject whose .Count is
+            # $null on Windows PowerShell 5.1, which made the guard below skip a one-row buffer.
+            $emitSet = @($buffer | ForEach-Object { $_ | Select-Object -Property $columnOrder })
+            if (-not $csvWriter) { Open-CsvWriter -Path $exportTemp -Columns $columnOrder; $csvWriter = $true }
+            if ($emitSet.Count -gt 0) { Write-CsvRows -Rows $emitSet -Columns $columnOrder }
+            $buffer.Clear()
+        }
+        # Cleanup: ensure writer closed before export finalization.
+        if ($csvWriter) { try { Close-CsvWriter } catch {} }
+
+        # Replay fallback: if no structured rows but we have raw logs, emit compact rows (non-exploded)
+        # NOTE(review): if $csvWriter was already $true it has just been closed above, and the fallback
+        # then writes without reopening - confirm whether that combination can occur.
+        if ($structuredDataCount -eq 0 -and $allLogs.Count -gt 0 -and $RAWInputCSV) {
+            Write-LogHost "Replay fallback: emitting compact non-exploded rows" -ForegroundColor Yellow
+            $fallbackOpenedWriter = $false
+            try {
+                $columnOrder = @('RecordId','CreationDate','RecordType','Operation','UserId','AuditData','AssociatedAdminUnits','AssociatedAdminUnitsNames')
+                if (-not $csvWriter) { Open-CsvWriter -Path $exportTemp -Columns $columnOrder; $csvWriter = $true; $fallbackOpenedWriter = $true }
+                $fallbackCount = 0
+                foreach ($log in $allLogs) {
+                    # Best effort per record: a record that fails to convert or write is skipped.
+                    try {
+                        $rows = Convert-ToStructuredRecord -Record $log -EnableExplosion:$false
+                        $rowsArr = To-RecordArray $rows
+                        if ($rowsArr.Count -gt 0) {
+                            Write-CsvRows -Rows ($rowsArr | ForEach-Object { $_ | Select-Object -Property $columnOrder }) -Columns $columnOrder
+                            # Count only after the write succeeded so the reported total matches the file.
+                            $fallbackCount += $rowsArr.Count
+                        }
+                    } catch {}
+                }
+                $structuredDataCount = $fallbackCount
+                try { $script:metrics.TotalStructuredRows = $fallbackCount } catch {}
+                Write-LogHost ("Replay fallback emitted: {0} rows" -f $fallbackCount) -ForegroundColor Yellow
+            } catch { Write-LogHost "Replay fallback failed: $($_.Exception.Message)" -ForegroundColor Red }
+            finally {
+                # The writer opened here is otherwise left open while the temp file is moved below.
+                if ($fallbackOpenedWriter) { try { Close-CsvWriter } catch {} }
+            }
+        }
+        # Fallback: ensure temp file exists so Move-Item does not fail (very small datasets may not have flushed rows yet)
+        # Skip this fallback if fast-path already moved the temp file to final output
+        if (-not $skipToPostProcessing -and -not (Test-Path $exportTemp)) {
+            $sw = $null
+            try {
+                # UTF-8 without BOM; the file receives a header line only.
+                $enc = New-Object System.Text.UTF8Encoding($false)
+                $sw = [System.IO.StreamWriter]::new($exportTemp, $false, $enc)
+                if ($columnOrder) {
+                    $escapedCols = New-Object System.Collections.Generic.List[string]
+                    foreach ($col in $columnOrder) {
+                        # Quote names containing quotes, commas or line breaks, or with leading/trailing spaces.
+                        $c = [string]$col; $needsQuote = ($c -match '[",\r\n]') -or $c.StartsWith(' ') -or $c.EndsWith(' ')
+                        $escaped = $c -replace '"','""'
+                        if ($needsQuote) { $escaped = '"' + $escaped + '"' }
+                        $escapedCols.Add($escaped) | Out-Null
+                    }
+                    $sw.WriteLine(($escapedCols -join ','))
+                } else {
+                    $sw.WriteLine('RecordId')
+                }
+                $sw.Flush()
+            } catch { Write-LogHost "WARNING: Fallback temp file creation failed: $($_.Exception.Message)" -ForegroundColor Yellow }
+            finally {
+                # Release the handle even when a write failed, so the later Move-Item is not blocked.
+                if ($sw) { $sw.Dispose() }
+            }
+        }
+
+        # Final explosion progress update (100% completion) - SERIAL MODE ONLY
+        # Parallel mode has its own completion summary, skip this to avoid duplicate/confusing output
+        # Also skip when fast path or streaming merge already handled export (skipToPostProcessing=true)
+        if (-not $useParallelExplosion -and -not $skipToPostProcessing) {
+            if ($allLogs.Count -ge $explosionProgressInterval) {
+                $elapsed = (Get-Date) - $te0
+                # Guarded like the parallel-mode rates; a zero interval would otherwise fail the [int] cast.
+                $rate = if ($elapsed.TotalSeconds -gt 0) { [int]($processedRecordCount / $elapsed.TotalSeconds) } else { 0 }
+                # Floor, not [int] cast: the cast rounds to nearest and would overstate the minutes.
+                $elapsedStr = if ($elapsed.TotalMinutes -ge 1) {
+                    "{0}m {1}s" -f [int][Math]::Floor($elapsed.TotalMinutes), $elapsed.Seconds
+                } else {
+                    "{0}s" -f [int][Math]::Floor($elapsed.TotalSeconds)
+                }
+                Write-LogHost ("[{0}] Processing: {1:N0} / {2:N0} records (100%) | Elapsed: {3} | Rate: {4:N0} rec/sec" -f (Get-Date -Format "HH:mm:ss"), $processedRecordCount, $allLogs.Count, $elapsedStr, $rate) -ForegroundColor DarkCyan
+            }
+
+            $te1 = Get-Date; try { $script:metrics.ExplosionMs += [int]($te1 - $te0).TotalMilliseconds } catch {}
+            Write-LogHost "Standard processing (streamed) complete:
$($allLogs.Count) input -> $structuredDataCount output" -ForegroundColor Cyan
+        }
+
+        # Explain record count changes (filtering vs explosion)
+        # Fewer output rows than input records: report the shortfall and list every active filter
+        # parameter that could have excluded records.
+        # NOTE(review): this check reads $allLogs.Count directly, while the checks further down use
+        # $inputRecordCount (which prefers $script:OriginalInputRecordCount). Confirm whether the
+        # streaming-merge path, where allLogs was cleared, should also reach this message.
+        if ($structuredDataCount -lt $allLogs.Count) {
+            $recordsFiltered = $allLogs.Count - $structuredDataCount
+            Write-LogHost ""
+            Write-LogHost " ℹ Record count decreased: $recordsFiltered parent record(s) were filtered out" -ForegroundColor Yellow
+            Write-LogHost " Note: Filtering happens DURING explosion (before array expansion)" -ForegroundColor DarkYellow
+
+            # List active filters that could cause record exclusion
+            # ($activeFilters is only used below to detect the "no filter active" case.)
+            $activeFilters = @()
+            if ($PromptFilter) {
+                $activeFilters += "PromptFilter ($PromptFilter mode)"
+                Write-LogHost " • PromptFilter: Records with no matching messages were excluded" -ForegroundColor DarkYellow
+            }
+            if ($AgentId) {
+                $activeFilters += "AgentId filter"
+                Write-LogHost " • AgentId: Records not matching specified AgentId(s) were excluded" -ForegroundColor DarkYellow
+            }
+            if ($AgentsOnly) {
+                $activeFilters += "AgentsOnly filter"
+                Write-LogHost " • AgentsOnly: Records without any AgentId were excluded" -ForegroundColor DarkYellow
+            }
+            if ($ExcludeAgents) {
+                $activeFilters += "ExcludeAgents filter"
+                Write-LogHost " • ExcludeAgents: Records with AgentId present were excluded" -ForegroundColor DarkYellow
+            }
+            if ($UserIds) {
+                $activeFilters += "UserIds filter"
+                Write-LogHost " • UserIds: Records not matching specified user(s) were excluded" -ForegroundColor DarkYellow
+            }
+            if ($GroupNames) {
+                $activeFilters += "GroupNames filter"
+                Write-LogHost " • GroupNames: Records not matching group members were excluded" -ForegroundColor DarkYellow
+            }
+
+            # No filter parameter explains the drop - say so explicitly.
+            if ($activeFilters.Count -eq 0) {
+                Write-LogHost " Reason: Unknown (no explicit filters active, possible internal filtering)" -ForegroundColor DarkYellow
+            }
+        }
+        # Use stored count for streaming merge (allLogs was cleared), otherwise use allLogs.Count
+        $inputRecordCount = if ($script:OriginalInputRecordCount) { $script:OriginalInputRecordCount } else { $allLogs.Count }
+
+        # Explosion was requested but rows equal records: explain the 1:1 ratio.
+        if ($structuredDataCount -eq $inputRecordCount -and ($ExplodeArrays -or $ExplodeDeep)) {
+            Write-LogHost ""
+            Write-LogHost " ℹ No explosion occurred (1:1 ratio)" -ForegroundColor Yellow
+            if ($PromptFilter) {
+                Write-LogHost " Reason: PromptFilter limits explosion to matching messages only" -ForegroundColor DarkYellow
+                Write-LogHost " Each record had exactly 1 matching message, producing 1 row per record" -ForegroundColor DarkYellow
+                Write-LogHost " Tip: Without PromptFilter, these records would explode to multiple rows" -ForegroundColor Cyan
+            } else {
+                Write-LogHost " Possible reasons: Records have no arrays to explode (Messages, Contexts, etc.)" -ForegroundColor DarkYellow
+            }
+        }
+        # More rows than records: report the expansion ratio, rounded to one decimal place.
+        elseif ($inputRecordCount -gt 0 -and $structuredDataCount -gt $inputRecordCount) {
+            $explosionRatio = [Math]::Round($structuredDataCount / $inputRecordCount, 1)
+            Write-LogHost ""
+            Write-LogHost " Array explosion successful: ${explosionRatio}x expansion ($inputRecordCount records → $structuredDataCount rows)" -ForegroundColor Green
+        }
+
+        # Warn when rows carried columns that were dropped because the schema was already frozen.
+        if ($postFreezeNewColumns -gt 0) { Write-LogHost "NOTICE: $postFreezeNewColumns row(s) contained new columns after schema freeze (ignored). This only affects serial mode - increase -StreamingSchemaSample or use parallel mode (PS7+) for full coverage."
-ForegroundColor DarkYellow }
+        Set-ProgressPhase -Phase 'Export' -Status 'Finalizing streaming CSV'
+
+        # Handle AppendFile mode vs normal mode (skip if fast-path already handled export)
+        if (-not $skipToPostProcessing) {
+            if ($AppendFile) {
+                # AppendFile mode: Always create temporary CSV with new data first
+                $tempCsvPath = Join-Path $OutputPath "Temp_NewData_$global:ScriptRunTimestamp.csv"
+                $tx0 = Get-Date; Move-Item -Force -Path $exportTemp -Destination $tempCsvPath; $tx1 = Get-Date
+                try { $script:metrics.ExportMs += [int]($tx1 - $tx0).TotalMilliseconds } catch {}
+
+                if ($ExportWorkbook) {
+                    # Excel AppendFile: Keep temp CSV for later reading
+                    # Store temp CSV path for Excel conversion to read from
+                    $script:AppendFileTempCsv = $tempCsvPath
+                    # $OutputFile stays as the user's target Excel file (set earlier at line 4355)
+                    Write-LogHost "Created temporary CSV for Excel append: $tempCsvPath" -ForegroundColor Gray
+                } else {
+                    # CSV AppendFile: Append new data to existing CSV
+                    Write-LogHost "Appending new data to existing CSV: $OutputFile" -ForegroundColor Cyan
+                    try {
+                        # Read new data (without header).
+                        # -Encoding UTF8 is explicit because Windows PowerShell 5.1 otherwise decodes a
+                        # BOM-less file with the ANSI code page and corrupts non-ASCII text.
+                        # @(...) keeps .Count meaningful for zero or one line.
+                        $newLines = @(Get-Content -Path $tempCsvPath -Encoding UTF8 -ErrorAction Stop | Select-Object -Skip 1)
+
+                        # Append to existing file (nothing to do when the temp file held only a header)
+                        if ($newLines.Count -gt 0) {
+                            Add-Content -Path $OutputFile -Value $newLines -Encoding UTF8 -ErrorAction Stop
+                        }
+
+                        Write-LogHost " Appended $($newLines.Count) new record(s) to existing CSV" -ForegroundColor Green
+
+                        # Clean up temporary file
+                        Remove-Item -Path $tempCsvPath -Force -ErrorAction SilentlyContinue
+                    }
+                    catch {
+                        # The temp CSV is deliberately kept so the new data is not lost; the run aborts.
+                        Write-Host "ERROR: Failed to append CSV data: $($_.Exception.Message)" -ForegroundColor Red
+                        Write-Host " Existing file: $OutputFile" -ForegroundColor Yellow
+                        Write-Host " New data (temp): $tempCsvPath" -ForegroundColor Yellow
+                        Write-Host "" -ForegroundColor Yellow
+                        Write-Host "Possible causes:" -ForegroundColor Yellow
+                        Write-Host " • File is open in Excel or another program" -ForegroundColor Gray
+                        Write-Host " • Insufficient permissions" -ForegroundColor Gray
+                        Write-Host " • Column mismatch between files" -ForegroundColor Gray
+                        Write-Host "" -ForegroundColor Yellow
+                        Write-Host "The new data is preserved in: $tempCsvPath" -ForegroundColor Cyan
+                        exit 1
+                    }
+                }
+            } else {
+                # Normal mode: Rename temp to final output (use CsvOutputFile for intermediate CSV when ExportWorkbook)
+                $tx0 = Get-Date; Move-Item -Force -Path $exportTemp -Destination $script:CsvOutputFile; $tx1 = Get-Date
+                try { $script:metrics.ExportMs += [int]($tx1 - $tx0).TotalMilliseconds } catch {}
+            }
+        } # End of skipToPostProcessing guard
+
+        $script:progressState.Export.Total = 1; $script:progressState.Export.Current = 1; Update-Progress -Status 'Export complete (stream)'; Set-ProgressPhase -Phase 'Complete' -Status 'Done'; Complete-Progress
+
+        # --- Dynamic Downgrade: If combined CSV contains data for only one activity type, rename to single-activity convention ---
+        # Note: Disabled when AppendFile is used to avoid renaming user-specified files
+        # Note: Skip for fast-path (non-explosion) runs - Import-Csv on large files is too slow
+        if (-not $skipToPostProcessing -and -not $ExportWorkbook -and $csvCombineMode -and (Test-Path $OutputFile) -and -not $AppendFile) {
+            try {
+                # Read all unique Operation values from the CSV
+                $allOperations = Import-Csv -Path $OutputFile -ErrorAction Stop |
+                    Where-Object { -not [string]::IsNullOrWhiteSpace($_.Operation) } |
+                    Select-Object -ExpandProperty Operation -Unique
+
+                $distinctOps = @($allOperations)
+                if ($distinctOps.Count -eq 1) {
+                    $onlyType = [string]$distinctOps[0] # Explicit cast to string to avoid array slicing
+                    if (-not [string]::IsNullOrWhiteSpace($onlyType)) {
+                        # Same character class as the per-activity splitter: '\\' adds the backslash,
+                        # which the previous pattern '[\/:*?"<>|]' did not cover.
+                        $safeType = $onlyType -replace '[\\/:*?"<>|]', '_'
+                        $singleName = "Purview_Audit_${safeType}_${global:ScriptRunTimestamp}.csv"
+                        $targetPath = Join-Path $OutputPath $singleName
+                        if ($OutputFile -ne $targetPath) {
+                            Write-LogHost "Detected single-activity result in combined mode: '$onlyType' → Renaming output file to
$singleName" -ForegroundColor Yellow
+                            Move-Item -Force -Path $OutputFile -Destination $targetPath
+                            $OutputFile = $targetPath
+                            # Update parameter snapshot if present
+                            if ($paramSnapshot -and $paramSnapshot.Contains('OutputFile')) { $paramSnapshot['OutputFile'] = $OutputFile }
+                        }
+                    }
+                }
+            } catch {
+                # The downgrade is cosmetic; on failure the combined file keeps its original name.
+                Write-LogHost "WARNING: Single-activity downgrade check failed: $($_.Exception.Message)" -ForegroundColor DarkYellow
+            }
+        }
+    } # End else (live audit log query mode)
+} # End if (-not $OnlyUserInfo) - Skip all audit log queries when only exporting user data
+
+if ($OnlyUserInfo) {
+    # -OnlyUserInfo mode: Initialize empty logs collection
+    $allLogs = New-Object System.Collections.ArrayList
+
+    # Handle OnlyUserInfo with ExportWorkbook - create Excel with just EntraUsers tab
+    if ($ExportWorkbook -and $script:EntraUsersData) {
+        $entraExcelFile = Join-Path $OutputPath "EntraUsers_MAClicensing_${global:ScriptRunTimestamp}.xlsx"
+        try {
+            $entraTab = 'EntraUsers_MAClicensing'
+            Write-LogHost "Creating Excel workbook with $entraTab tab ($($script:EntraUsersData.Count) rows)..." -ForegroundColor Cyan
+            $dataTable = $script:EntraUsersData | ConvertTo-DataTable
+            Send-SQLDataToExcel -DataTable $dataTable -Path $entraExcelFile -WorkSheetName $entraTab -Force -FreezeTopRow -BoldTopRow -AutoSize -NoNumberConversion '*'
+            Write-LogHost "EntraUsers Excel workbook created: $entraExcelFile" -ForegroundColor Green
+        } catch {
+            Write-LogHost "WARNING: Failed to export EntraUsers Excel: $($_.Exception.Message)" -ForegroundColor Yellow
+        }
+    }
+
+    # Handle OnlyUserInfo CSV export (non-workbook mode)
+    if (-not $ExportWorkbook -and $script:EntraUsersData) {
+        $entraFile = Join-Path $OutputPath "EntraUsers_MAClicensing_${global:ScriptRunTimestamp}.csv"
+        try {
+            Write-LogHost "Exporting EntraUsers data to CSV ($($script:EntraUsersData.Count) rows)..." -ForegroundColor Cyan
+            $script:EntraUsersData | Export-Csv -Path $entraFile -NoTypeInformation -Encoding UTF8 -ErrorAction Stop
+            Write-LogHost "EntraUsers CSV created: $entraFile" -ForegroundColor Green
+        } catch {
+            Write-LogHost "WARNING: Failed to export EntraUsers CSV: $($_.Exception.Message)" -ForegroundColor Yellow
+        }
+    }
+}
+# Separate-file CSV mode: plain CSV output that is neither combined nor appended.
+$csvSeparateMode = (-not $ExportWorkbook -and -not $csvCombineMode -and -not $AppendFile)
+
+# --- EntraUsers CSV export (for IncludeUserInfo mode with combined/append CSV) ---
+# Note: OnlyUserInfo mode is handled in the dedicated block above
+# Export if: not workbook mode AND IncludeUserInfo enabled (not OnlyUserInfo) AND data exists
+if (-not $ExportWorkbook -and -not $OnlyUserInfo -and $IncludeUserInfo -and $script:EntraUsersData) {
+    $entraFile = Join-Path $OutputPath "EntraUsers_MAClicensing_${global:ScriptRunTimestamp}.csv"
+    try {
+        $entraWritten = $false
+        if ($script:EntraUsersData.Count -gt 0) {
+            $script:EntraUsersData | Export-Csv -Path $entraFile -NoTypeInformation -Encoding UTF8 -ErrorAction Stop
+            $entraWritten = $true
+        } else {
+            # Header-only export
+            $header = ($script:EntraUsersData | Select-Object -First 1 | Get-Member -MemberType NoteProperty | Select-Object -ExpandProperty Name)
+            if ($header) { ($null | Select-Object $header) | Export-Csv -Path $entraFile -NoTypeInformation -Encoding UTF8; $entraWritten = $true }
+        }
+        # Report success only when a file was actually produced (the header-only branch may write nothing).
+        if ($entraWritten) { Write-LogHost "EntraUsers CSV created: $entraFile" -ForegroundColor Green }
+    } catch { Write-LogHost "WARNING: Failed to export EntraUsers CSV: $($_.Exception.Message)" -ForegroundColor Yellow }
+}
+
+if ($csvSeparateMode -and -not $OnlyUserInfo -and (Test-Path $OutputFile)) {
+    Write-LogHost ""
+    Write-LogHost "=== Splitting CSV by Activity Type ===" -ForegroundColor Cyan
+    Write-LogHost "Reading combined CSV: $OutputFile" -ForegroundColor Gray
+
+    try {
+        # Read combined CSV. @(...) guarantees an array so .Count is correct for a single-row file
+        # (a lone PSCustomObject has no Count on Windows PowerShell 5.1).
+        $allRecords = @(Import-Csv -Path $OutputFile -ErrorAction Stop)
+        Write-LogHost "Loaded $($allRecords.Count) records from combined CSV" -ForegroundColor Gray
+
+        # Group by Operation
field
+        # @(...) is required: with a single activity type Group-Object returns one GroupInfo, whose
+        # own .Count is the number of rows in that group, not the number of groups.
+        $groupedRecords = @($allRecords | Group-Object -Property Operation)
+        Write-LogHost "Found $($groupedRecords.Count) activity types" -ForegroundColor Gray
+
+        # Write separate CSV files
+        $outputDir = Split-Path $OutputFile -Parent
+        $createdFiles = @()
+
+        foreach ($group in $groupedRecords) {
+            $activityType = $group.Name
+            if ([string]::IsNullOrWhiteSpace($activityType)) { $activityType = "Unknown" }
+
+            # Sanitize filename (remove invalid characters)
+            $safeActivityName = $activityType -replace '[\\/:*?"<>|]', '_'
+            # File name is the fixed prefix + sanitized activity type + run timestamp
+            $fileName = "Purview_Audit_${safeActivityName}_${global:ScriptRunTimestamp}.csv"
+            $filePath = Join-Path $outputDir $fileName # Export to separate CSV
+            $group.Group | Export-Csv -Path $filePath -NoTypeInformation -Encoding UTF8 -ErrorAction Stop
+            $createdFiles += $filePath
+
+            Write-LogHost " • $activityType → $fileName ($($group.Count) records)" -ForegroundColor DarkCyan
+        }
+
+        # Delete combined CSV file
+        Remove-Item -Path $OutputFile -Force -ErrorAction SilentlyContinue
+        Write-LogHost "Removed combined CSV (replaced with $($createdFiles.Count) separate files)" -ForegroundColor Gray
+
+        # Export EntraUsers CSV in separated mode
+        if ($IncludeUserInfo -and $script:EntraUsersData) {
+            $entraFile = Join-Path $outputDir "EntraUsers_MAClicensing_${global:ScriptRunTimestamp}.csv"
+            try {
+                $script:EntraUsersData | Export-Csv -Path $entraFile -NoTypeInformation -Encoding UTF8 -ErrorAction Stop
+                Write-LogHost " • EntraUsers → $(Split-Path -Leaf $entraFile) ($($script:EntraUsersData.Count) rows)" -ForegroundColor DarkCyan
+                $createdFiles += $entraFile
+            } catch { Write-LogHost "WARNING: Failed to export EntraUsers CSV: $($_.Exception.Message)" -ForegroundColor Yellow }
+        }
+
+        # Record the list of files produced by the split in $script:CsvSplitFiles ($OutputFile itself is not changed)
+        $script:CsvSplitFiles = $createdFiles
+        Write-LogHost "CSV splitting complete: $($createdFiles.Count) files created" -ForegroundColor Green
+
+    } catch {
+        # Any failure before the Remove-Item above leaves the combined CSV in place.
+        Write-LogHost "WARNING: CSV splitting failed: $($_.Exception.Message)" -ForegroundColor Yellow
+        Write-LogHost "Combined CSV retained at: $OutputFile" -ForegroundColor Yellow
+    }
+}
+
+# --- Excel Post-Processing Conversion (Option A) ---
+if ($ExportWorkbook -and $script:CsvOutputFile -and (Test-Path $script:CsvOutputFile)) {
+    Write-LogHost ""
+    Write-LogHost "=== Converting CSV to Excel ===" -ForegroundColor Cyan
+
+    # Excel filename already determined at script start in $OutputFile
+    $excelFilePath = $OutputFile
+
+    # Handle AppendFile mode vs normal mode
+    # Note: If AppendFile mode, temp CSV was already created during CSV export (stored in $script:AppendFileTempCsv)
+    if ($AppendFile) {
+        # AppendFile mode: Use user-specified Excel file and temp CSV
+        $excelFilePath = $OutputFile # User's target Excel file (set at line 4355)
+        $csvFilePath = $script:AppendFileTempCsv # Temp CSV with new data (set at line 7557)
+
+        Write-LogHost "AppendFile mode: Appending to existing workbook: $excelFilePath" -ForegroundColor Cyan
+        Write-LogHost " Reading new data from: $csvFilePath" -ForegroundColor Gray
+
+        # Pre-flight: Test file accessibility before trying Excel operations
+        # (opened read-only with ReadWrite sharing, then closed immediately)
+        try {
+            $fileStream = [System.IO.File]::Open($excelFilePath, [System.IO.FileMode]::Open, [System.IO.FileAccess]::Read, [System.IO.FileShare]::ReadWrite)
+            $fileStream.Close()
+            $fileStream.Dispose()
+        }
+        catch {
+            Write-Host "ERROR: Cannot access file for reading: $($_.Exception.Message)" -ForegroundColor Red
+            Write-Host " File: $excelFilePath" -ForegroundColor Yellow
+            Write-Host "" -ForegroundColor Yellow
+            Write-Host "Common causes:" -ForegroundColor Yellow
+            Write-Host " • File is currently open in Excel with exclusive lock (close it and try again)" -ForegroundColor Gray
+            Write-Host " • File is in a OneDrive/SharePoint folder with sync issues (check file status)" -ForegroundColor Gray
+            Write-Host " • Insufficient permissions to read the file"
-ForegroundColor Gray
+            Write-Host "" -ForegroundColor Yellow
+            Write-Host "Troubleshooting steps:" -ForegroundColor Cyan
+            Write-Host " 1. Close Excel if the file is open" -ForegroundColor Gray
+            Write-Host " 2. Copy file to a local folder (C:\temp) and retry" -ForegroundColor Gray
+            Write-Host " 3. Verify file permissions and OneDrive sync status" -ForegroundColor Gray
+            exit 1
+        }
+
+        # Read existing sheets inline for validation
+        try {
+            # Ensure ImportExcel module is loaded
+            if (-not (Get-Module -Name ImportExcel)) {
+                Write-Host "WARNING: ImportExcel module not loaded, attempting to load..." -ForegroundColor Yellow
+                Import-Module ImportExcel -ErrorAction Stop
+            }
+
+            $existingSheets = Get-ExcelSheetInfo -Path $excelFilePath | Select-Object -ExpandProperty Name
+            $script:ExistingExcelSheets = $existingSheets
+            Write-LogHost " Existing sheets: $($existingSheets -join ', ')" -ForegroundColor DarkGray
+        }
+        catch {
+            # The workbook could not be read: report the error, then probe the file to narrow down
+            # the cause (not a ZIP / ZIP without xl/workbook.xml / ImportExcel parse problem).
+            Write-Host "ERROR: Cannot read Excel workbook structure: $($_.Exception.Message)" -ForegroundColor Red
+            Write-Host " File: $excelFilePath" -ForegroundColor Yellow
+            Write-Host "" -ForegroundColor Yellow
+
+            # Try to determine the actual issue
+            $isValidZip = $false
+            $hasWorkbookXml = $false
+            $zip = $null
+            try {
+                Add-Type -AssemblyName System.IO.Compression.FileSystem -ErrorAction SilentlyContinue
+                $zip = [System.IO.Compression.ZipFile]::OpenRead($excelFilePath)
+                $isValidZip = $true
+                $hasWorkbookXml = [bool]($zip.Entries | Where-Object { $_.FullName -eq 'xl/workbook.xml' })
+            }
+            catch {
+                # Not a valid ZIP
+            }
+            finally {
+                # Always release the archive handle, including when reading the entries failed.
+                if ($zip) { $zip.Dispose() }
+            }
+
+            if (-not $isValidZip) {
+                Write-Host "Root cause: File is not a valid ZIP archive (Excel files are ZIP containers)" -ForegroundColor Yellow
+                Write-Host "" -ForegroundColor Yellow
+                Write-Host "Possible causes:" -ForegroundColor Yellow
+                Write-Host " • File was renamed from .csv to .xlsx (not converted)" -ForegroundColor Gray
+                Write-Host " • File download was interrupted or corrupted" -ForegroundColor Gray
+                Write-Host " • File created by incompatible tool" -ForegroundColor Gray
+            }
+            elseif (-not $hasWorkbookXml) {
+                Write-Host "Root cause: ZIP file is valid but missing 'xl/workbook.xml' (not a proper Excel workbook)" -ForegroundColor Yellow
+                Write-Host "" -ForegroundColor Yellow
+                Write-Host "Possible causes:" -ForegroundColor Yellow
+                Write-Host " • File is corrupted or incomplete" -ForegroundColor Gray
+                Write-Host " • File created by tool that doesn't follow Excel format" -ForegroundColor Gray
+            }
+            else {
+                Write-Host "Root cause: ImportExcel module cannot parse this workbook" -ForegroundColor Yellow
+                Write-Host "" -ForegroundColor Yellow
+                Write-Host "Possible causes:" -ForegroundColor Yellow
+                Write-Host " • ImportExcel module version incompatibility" -ForegroundColor Gray
+                Write-Host " • File created by different ImportExcel version" -ForegroundColor Gray
+                Write-Host " • Workbook has features ImportExcel can't parse" -ForegroundColor Gray
+                Write-Host "" -ForegroundColor Yellow
+                Write-Host "Current ImportExcel version:" -ForegroundColor Cyan
+                try {
+                    $importExcelModule = Get-Module -Name ImportExcel -ListAvailable | Select-Object -First 1
+                    Write-Host " $($importExcelModule.Version)" -ForegroundColor Gray
+                }
+                catch {
+                    Write-Host " Unable to detect version" -ForegroundColor Gray
+                }
+            }
+
+            Write-Host "" -ForegroundColor Yellow
+            Write-Host "Recommended solutions:" -ForegroundColor Cyan
+            Write-Host " 1. Open file in Excel and verify it opens correctly" -ForegroundColor Gray
+            Write-Host " 2. If it opens: File > Save As > Excel Workbook (.xlsx) to 'clean' it" -ForegroundColor Gray
+            Write-Host " 3. Update ImportExcel: Update-Module ImportExcel -Force" -ForegroundColor Gray
+            Write-Host " 4. Recreate initial export without -AppendFile using current script" -ForegroundColor Gray
+            Write-Host "" -ForegroundColor Yellow
+            Write-Host "DEBUG INFO:" -ForegroundColor Cyan
+            Write-Host " File exists: $(Test-Path $excelFilePath)" -ForegroundColor Gray
+            Write-Host " File size: $((Get-Item $excelFilePath -ErrorAction SilentlyContinue).Length) bytes" -ForegroundColor Gray
+            Write-Host " File extension: $([System.IO.Path]::GetExtension($excelFilePath))" -ForegroundColor Gray
+            Write-Host " First 4 bytes (hex): " -NoNewline -ForegroundColor Gray
+            try {
+                # Read just the 4-byte signature instead of loading the whole workbook into memory.
+                $sigBuffer = New-Object byte[] 4
+                $sigStream = [System.IO.File]::Open($excelFilePath, [System.IO.FileMode]::Open, [System.IO.FileAccess]::Read, [System.IO.FileShare]::ReadWrite)
+                try { $sigRead = $sigStream.Read($sigBuffer, 0, 4) } finally { $sigStream.Dispose() }
+                # Keep only the bytes actually read (the file may be shorter than 4 bytes).
+                $bytes = @($sigBuffer | Select-Object -First $sigRead)
+                Write-Host "$(($bytes | ForEach-Object { $_.ToString('X2') }) -join ' ')" -ForegroundColor Gray
+                if ($bytes[0] -eq 0x50 -and $bytes[1] -eq 0x4B) {
+                    Write-Host " (Valid ZIP signature: PK)" -ForegroundColor Green
+                } else {
+                    Write-Host " (NOT a ZIP file - should start with 'PK' = 50 4B)" -ForegroundColor Red
+                }
+            }
+            catch {
+                Write-Host "Unable to read bytes" -ForegroundColor Gray
+            }
+            exit 1
+        }
+    } else {
+        # Normal mode: CSV path uses the intermediate CsvOutputFile
+        $csvFilePath = $script:CsvOutputFile
+    }
+
+    # Excel conversion - use fast path when possible
+    try {
+        if ($CombineOutput) {
+            # --- Combined Mode: Single-tab workbook ---
+            $tabName = "CombinedUsageActivity"
+
+            if ($AppendFile -and $script:ExistingExcelSheets -contains $tabName) {
+                # Append mode: Need to validate headers (requires loading CSV)
+                Write-LogHost "Reading CSV data for header validation: $csvFilePath" -ForegroundColor Gray
+                # @(...) keeps .Count correct for a single-row CSV (a lone PSCustomObject has no
+                # Count on Windows PowerShell 5.1).
+                $csvData = @(Import-Csv -Path $csvFilePath -ErrorAction Stop)
+                $totalRows = $csvData.Count
+                Write-LogHost "Loaded $totalRows rows from CSV" -ForegroundColor Gray
+
+                # Validate headers match
+                Write-LogHost "Validating headers for tab: $tabName" -ForegroundColor Gray
+                $existingWorkbook = Import-Excel -Path $excelFilePath -WorksheetName $tabName -StartRow 1 -EndRow 1 -NoHeader
+                $existingHeaders =
$existingWorkbook[0].PSObject.Properties.Value | Where-Object { $_ } + $newHeaders = $csvData[0].PSObject.Properties.Name + + $headerMismatch = $false + if ($existingHeaders.Count -ne $newHeaders.Count) { + $headerMismatch = $true + } else { + for ($i = 0; $i -lt $existingHeaders.Count; $i++) { + if ($existingHeaders[$i] -ne $newHeaders[$i]) { + $headerMismatch = $true + break + } + } + } + + if ($headerMismatch) { + # Create timestamped duplicate tab + $timestampedTabName = "${tabName}_$excelTimestamp" + Write-LogHost "WARNING: Header mismatch detected for tab '$tabName'" -ForegroundColor Yellow + Write-LogHost "Creating timestamped duplicate tab: $timestampedTabName" -ForegroundColor Yellow + $dataTable = $csvData | ConvertTo-DataTable + Send-SQLDataToExcel -DataTable $dataTable -Path $excelFilePath -WorkSheetName $timestampedTabName -Force -FreezeTopRow -BoldTopRow -AutoSize -NoNumberConversion '*' + } else { + # Append to existing tab + Write-LogHost "Appending to existing tab: $tabName" -ForegroundColor Gray + $dataTable = $csvData | ConvertTo-DataTable + Send-SQLDataToExcel -DataTable $dataTable -Path $excelFilePath -WorkSheetName $tabName -Append -FreezeTopRow -BoldTopRow -AutoSize -NoNumberConversion '*' + } + } else { + # Create new tab or new workbook + Write-LogHost "Creating tab: $tabName" -ForegroundColor Gray + $dataTable = Import-CsvToDataTable -Path $csvFilePath + $totalRows = $dataTable.Rows.Count + Send-SQLDataToExcel -DataTable $dataTable -Path $excelFilePath -WorkSheetName $tabName -Force -FreezeTopRow -BoldTopRow -AutoSize -NoNumberConversion '*' + # Append EntraUsers tab if requested + if ($IncludeUserInfo -and $script:EntraUsersData) { + $entraTab = 'EntraUsers_MAClicensing' + Write-LogHost "Creating tab: $entraTab" -ForegroundColor Gray + $entraDataTable = $script:EntraUsersData | ConvertTo-DataTable + Send-SQLDataToExcel -DataTable $entraDataTable -Path $excelFilePath -WorkSheetName $entraTab -FreezeTopRow -BoldTopRow -AutoSize 
-NoNumberConversion '*' + } + } + + Write-LogHost "Excel workbook created: $excelFilePath" -ForegroundColor Green + Write-LogHost " Tab: $tabName | Rows: $totalRows" -ForegroundColor White } else { + # --- Multi-tab Mode: One tab per activity type --- + # Multi-tab mode requires loading CSV for grouping + Write-LogHost "Reading CSV data for multi-tab grouping: $csvFilePath" -ForegroundColor Gray + $csvData = Import-Csv -Path $csvFilePath -ErrorAction Stop + Write-LogHost "Loaded $($csvData.Count) rows from CSV" -ForegroundColor Gray + + # Group CSV data by Operation column + $groupedData = $csvData | Group-Object -Property Operation + + # Calculate total tab count (activity types + EntraUsers if present) + $activityTabCount = $groupedData.Count + $totalTabCount = $activityTabCount + if ($IncludeUserInfo -and $script:EntraUsersData) { + $totalTabCount += 1 # Add EntraUsers tab + } + + $tabLabel = if ($totalTabCount -eq 1) { "tab" } else { "tabs" } + Write-LogHost "Creating multi-tab workbook with $totalTabCount $tabLabel" -ForegroundColor Gray + + $tabsCreated = @() + foreach ($group in $groupedData) { + $activityType = $group.Name + $activityData = $group.Group + $activityRows = $activityData.Count + + if ($AppendFile -and $script:ExistingExcelSheets -contains $activityType) { + # Validate headers match + Write-LogHost "Validating headers for tab: $activityType" -ForegroundColor Gray + $existingWorkbook = Import-Excel -Path $excelFilePath -WorksheetName $activityType -StartRow 1 -EndRow 1 -NoHeader + $existingHeaders = $existingWorkbook[0].PSObject.Properties.Value | Where-Object { $_ } + $newHeaders = $activityData[0].PSObject.Properties.Name + + $headerMismatch = $false + if ($existingHeaders.Count -ne $newHeaders.Count) { + $headerMismatch = $true + } else { + for ($i = 0; $i -lt $existingHeaders.Count; $i++) { + if ($existingHeaders[$i] -ne $newHeaders[$i]) { + $headerMismatch = $true + break + } + } + } + + if ($headerMismatch) { + # Create timestamped 
duplicate tab + $timestampedTabName = "${activityType}_$excelTimestamp" + Write-LogHost "WARNING: Header mismatch detected for tab '$activityType'" -ForegroundColor Yellow + Write-LogHost "Creating timestamped duplicate tab: $timestampedTabName" -ForegroundColor Yellow + $activityDataTable = $activityData | ConvertTo-DataTable + Send-SQLDataToExcel -DataTable $activityDataTable -Path $excelFilePath -WorkSheetName $timestampedTabName -Force -FreezeTopRow -BoldTopRow -AutoSize -NoNumberConversion '*' + $tabsCreated += "$timestampedTabName ($activityRows rows)" + } else { + # Append to existing tab + Write-LogHost "Appending to existing tab: $activityType" -ForegroundColor Gray + $activityDataTable = $activityData | ConvertTo-DataTable + Send-SQLDataToExcel -DataTable $activityDataTable -Path $excelFilePath -WorkSheetName $activityType -Append -FreezeTopRow -BoldTopRow -AutoSize -NoNumberConversion '*' + $tabsCreated += "$activityType ($activityRows rows appended)" + } + } else { + # Create new tab + Write-LogHost "Creating tab: $activityType ($activityRows rows)" -ForegroundColor Gray + $activityDataTable = $activityData | ConvertTo-DataTable + Send-SQLDataToExcel -DataTable $activityDataTable -Path $excelFilePath -WorkSheetName $activityType -Force -FreezeTopRow -BoldTopRow -AutoSize -NoNumberConversion '*' + $tabsCreated += "$activityType ($activityRows rows)" + } + } + + + # Activity-tab foreach is closed above: append the EntraUsers tab exactly once, after all activity tabs, if requested + if ($IncludeUserInfo -and $script:EntraUsersData) { + $entraTab = 'EntraUsers_MAClicensing' + Write-LogHost "Creating tab: $entraTab ($($script:EntraUsersData.Count) rows)" -ForegroundColor Gray + $entraDataTable = $script:EntraUsersData | ConvertTo-DataTable + Send-SQLDataToExcel -DataTable $entraDataTable -Path $excelFilePath -WorkSheetName $entraTab -FreezeTopRow -BoldTopRow -AutoSize -NoNumberConversion '*' + $tabsCreated += "$entraTab ($($script:EntraUsersData.Count) rows)" + } + + + Write-LogHost "Excel workbook created:
$excelFilePath" -ForegroundColor Green + Write-LogHost " Tabs: $($tabsCreated -join ', ')" -ForegroundColor White + } + # Delete temporary CSV file (with retry for file lock issues) + if ($AppendFile -and $script:AppendFileTempCsv) { + # AppendFile mode: Remove the temp CSV we created for new data + Write-LogHost "Removing temporary CSV file: $script:AppendFileTempCsv" -ForegroundColor Gray + Remove-Item -Path $script:AppendFileTempCsv -Force -ErrorAction SilentlyContinue + } elseif (-not $AppendFile) { + # Normal mode: Remove the intermediate CSV that was converted to Excel + Write-LogHost "Removing temporary CSV file: $script:CsvOutputFile" -ForegroundColor Gray + # Retry with delay to handle transient file locks (antivirus, OneDrive sync, etc.) + $deleteSuccess = $false + for ($retryCount = 0; $retryCount -lt 3; $retryCount++) { + try { + # Force garbage collection to release any .NET file handles + [System.GC]::Collect() + [System.GC]::WaitForPendingFinalizers() + Start-Sleep -Milliseconds 500 + Remove-Item -Path $script:CsvOutputFile -Force -ErrorAction Stop + $deleteSuccess = $true + break + } catch { + if ($retryCount -lt 2) { + Write-LogHost " File locked, retrying in 2 seconds... 
(attempt $($retryCount + 1)/3)" -ForegroundColor DarkYellow + Start-Sleep -Seconds 2 + } + } + } + if (-not $deleteSuccess) { + Write-LogHost " Could not delete temp CSV (file may be locked by another process)" -ForegroundColor Yellow + Write-LogHost " CSV file preserved at: $script:CsvOutputFile" -ForegroundColor Yellow + } + } + # Note: $OutputFile already points to the final Excel file + } catch { + Write-LogHost "ERROR: Failed to convert CSV to Excel: $($_.Exception.Message)" -ForegroundColor Red + Write-LogHost "CSV file preserved at: $script:CsvOutputFile" -ForegroundColor Yellow + } + } + + # ============================================================ + # CHECKPOINT COMPLETION: Rename _PARTIAL file and delete checkpoint + # ============================================================ + if ($script:CheckpointEnabled -and $script:PartialOutputPath -and (Test-Path $script:PartialOutputPath)) { + Complete-CheckpointRun -FinalOutputPath $script:FinalOutputPath + # Update OutputFile to point to final path (without _PARTIAL) for correct display + $OutputFile = $script:FinalOutputPath + $LogFile = $script:LogFile # Also update LogFile variable (was updated by Complete-CheckpointRun) + } + + # ============================================================ + # FIX 37: FALLBACK LOG RENAME — remove _PARTIAL from log file + # When CSV split mode deletes the combined _PARTIAL.csv before + # Complete-CheckpointRun runs, the Test-Path guard above fails + # and the log file is never renamed. This catch-all handles that + # case (and any other code path that could skip the rename). + # GUARD: Only rename on genuinely completed runs — interrupted + # or failed runs MUST keep _PARTIAL so Resume mode can detect them. 
+ # ============================================================ + if ($script:LogFile -and $script:LogFile -match '_PARTIAL\.log$' -and (Test-Path $script:LogFile) -and -not $script:CtrlCPressed -and -not $script:EarlyExit) { + try { + $finalLogPath = $script:LogFile -replace '_PARTIAL\.log$', '.log' + if (Test-Path $finalLogPath) { + $logDir = Split-Path $finalLogPath -Parent + $logName = [System.IO.Path]::GetFileNameWithoutExtension($finalLogPath) + $ts = Get-Date -Format 'yyyyMMdd_HHmmss' + $finalLogPath = Join-Path $logDir "${logName}_${ts}.log" + } + Move-Item -Path $script:LogFile -Destination $finalLogPath -Force + $script:LogFile = $finalLogPath + $LogFile = $finalLogPath + } catch { + # Non-fatal: log file keeps _PARTIAL suffix but data is intact + } + } + + Write-LogHost ""; Write-LogHost "=== Enterprise Export Complete ===" -ForegroundColor Green + + if ($OnlyUserInfo) { + # User-only export mode summary + if ($script:EntraUsersData) { + Write-LogHost "Entra users exported: $($script:EntraUsersData.Count)" -ForegroundColor White + } + } else { + # Standard audit log export summary + Write-LogHost "Processing mode: $processingMode" -ForegroundColor White + Write-LogHost "Records exported: $($script:metrics.TotalStructuredRows)" -ForegroundColor White + if ($IncludeUserInfo -and -not $UseEOM -and $script:EntraUsersData) { + Write-LogHost "Entra users exported: $($script:EntraUsersData.Count)" -ForegroundColor White + } + } + + + # Display organized tail summary metrics - only for audit log exports + if (-not $OnlyUserInfo -and $script:metrics -and $script:metrics.Activities -and $script:metrics.Activities.Count -gt 0) { + Write-LogHost "" + Write-LogHost "=== Activity Type Breakdown ===" -ForegroundColor Cyan + + # In resume mode with skipped partitions, the "Retrieved" counts only reflect THIS run's fetches + # Records from previously-completed partitions (merged from incremental saves) are not counted in Retrieved + if ($script:IsResumeMode -and 
$script:OriginallySkippedPartitionIndices -and $script:OriginallySkippedPartitionIndices.Count -gt 0) { + Write-LogHost " Note: Resume mode - 'Retrieved' counts reflect only this run's fetches" -ForegroundColor DarkGray + Write-LogHost " (excludes $($script:OriginallySkippedPartitionIndices.Count) previously-completed partition(s))" -ForegroundColor DarkGray + Write-LogHost "" + } + + $totalRetrieved = $script:metrics.TotalRecordsFetched + $totalExported = $script:metrics.TotalStructuredRows + $totalFiltered = $script:metrics.FilteringSkippedRecords + + foreach ($actKey in ($script:metrics.Activities.Keys | Sort-Object)) { + $actStats = $script:metrics.Activities[$actKey] + $retrieved = $actStats.Retrieved + $structured = $actStats.Structured + $ratio = if ($retrieved -gt 0) { [Math]::Round($structured / $retrieved, 1) } else { 0 } + + Write-LogHost " $actKey" -ForegroundColor White + Write-LogHost " Retrieved: $retrieved records" -ForegroundColor Gray + + # Show filtering breakdown if records were filtered for this activity type + $actFiltered = $retrieved - $structured + if ($actFiltered -gt 0 -and $ratio -le 1) { + Write-LogHost " Filtered: $actFiltered records" -ForegroundColor DarkYellow + + # Only show filters that were actually applied (check parameter + impact) + # Technical failures (always show if present) + if ($script:metrics.FilteringMissingAuditData -gt 0) { + Write-LogHost " - Missing/invalid AuditData: $($script:metrics.FilteringMissingAuditData)" -ForegroundColor DarkGray + } + if ($script:metrics.FilteringParseFailures -gt 0) { + Write-LogHost " - Parse failures: $($script:metrics.FilteringParseFailures)" -ForegroundColor DarkGray + } + + # User-specified filters (only show if parameter was used) + if ($PromptFilter) { + $promptCount = if ($script:metrics.FilteringPromptFiltered -gt 0) { $script:metrics.FilteringPromptFiltered } else { 0 } + Write-LogHost " - PromptFilter ($PromptFilter): $promptCount" -ForegroundColor DarkGray + } + if ($AgentId) 
{ + $agentCount = if ($script:metrics.FilteringAgentFiltered -gt 0) { $script:metrics.FilteringAgentFiltered } else { 0 } + Write-LogHost " - AgentId filter: $agentCount" -ForegroundColor DarkGray + } + if ($AgentsOnly) { + $agentCount = if ($script:metrics.FilteringAgentFiltered -gt 0) { $script:metrics.FilteringAgentFiltered } else { 0 } + Write-LogHost " - AgentsOnly filter: $agentCount" -ForegroundColor DarkGray + } + if ($ExcludeAgents) { + $excludeCount = if ($script:metrics.FilteringExcludeAgents -gt 0) { $script:metrics.FilteringExcludeAgents } else { 0 } + Write-LogHost " - ExcludeAgents filter: $excludeCount" -ForegroundColor DarkGray + } + if ($UserIds) { + $userCount = if ($script:metrics.FilteringUserIds -gt 0) { $script:metrics.FilteringUserIds } else { 0 } + Write-LogHost " - UserIds filter: $userCount" -ForegroundColor DarkGray + } + if ($GroupNames) { + $groupCount = if ($script:metrics.FilteringGroupNames -gt 0) { $script:metrics.FilteringGroupNames } else { 0 } + Write-LogHost " - GroupNames filter: $groupCount" -ForegroundColor DarkGray + } + + # Calculate explained filtering count + $explainedFiltering = 0 + $explainedFiltering += $script:metrics.FilteringMissingAuditData + $explainedFiltering += $script:metrics.FilteringParseFailures + if ($PromptFilter) { $explainedFiltering += $script:metrics.FilteringPromptFiltered } + if ($AgentId) { $explainedFiltering += $script:metrics.FilteringAgentFiltered } + if ($AgentsOnly) { $explainedFiltering += $script:metrics.FilteringAgentFiltered } + if ($ExcludeAgents) { $explainedFiltering += $script:metrics.FilteringExcludeAgents } + if ($UserIds) { $explainedFiltering += $script:metrics.FilteringUserIds } + if ($GroupNames) { $explainedFiltering += $script:metrics.FilteringGroupNames } + + # Show unspecified reason if filtered count doesn't match explained reasons + if ($explainedFiltering -eq 0 -or $actFiltered -gt $explainedFiltering) { + $unexplained = $actFiltered - $explainedFiltering + if 
($unexplained -gt 0) { + $yieldLabel = if ($ExplodeArrays -or $ExplodeDeep) { "Array explosion yield" } else { "Processing yield" } + Write-LogHost " - ${yieldLabel}: $unexplained" -ForegroundColor DarkGray + } + } + } + + # Show explosion details if exploding mode is enabled AND ratio > 1 + if (($ExplodeArrays -or $ExplodeDeep) -and $ratio -gt 1 -and -not $ExcelOutput) { + Write-LogHost " Exported: $structured rows (${ratio}x expansion)" -ForegroundColor Gray + if ($script:metrics.ExplosionEvents -gt 0) { + $avgExpansion = if ($script:metrics.ExplosionEvents -gt 0) { + [Math]::Round(($script:metrics.ExplosionRowsFromEvents / $script:metrics.ExplosionEvents) + 1, 1) + } else { 1 } + Write-LogHost " - Avg expansion: ${avgExpansion}x per record" -ForegroundColor DarkGray + if ($script:metrics.ExplosionMaxPerRecord -gt 0) { + Write-LogHost " - Max expansion: $($script:metrics.ExplosionMaxPerRecord)x (single record)" -ForegroundColor DarkGray + } + } + } else { + # Always show exported count for consistency + Write-LogHost " Exported: $structured rows" -ForegroundColor Gray + } + } + + # Final pipeline summary + Write-LogHost "" + Write-LogHost "Pipeline Summary:" -ForegroundColor Cyan + Write-LogHost " Retrieved: $totalRetrieved records" -ForegroundColor White + if ($totalFiltered -gt 0) { + Write-LogHost " Filtered: $totalFiltered records" -ForegroundColor White + } + Write-LogHost " Exported: $totalExported rows" -ForegroundColor White + # Show duplicate removal count in Pipeline Summary when streaming merge deduplicated records + if ($script:StreamingMergeDuplicatesSkipped -gt 0) { + Write-LogHost " Deduped: $($script:StreamingMergeDuplicatesSkipped) duplicate records removed" -ForegroundColor DarkGray + } + # Show date-range trim count in Pipeline Summary + if ($script:DateTrimCount -gt 0) { + Write-LogHost " Trimmed: $($script:DateTrimCount) record(s) outside requested date range" -ForegroundColor DarkGray + } + # Show data loss warning in Pipeline Summary if 
partitions were missing + if ($script:StreamingMergeDataLoss) { + Write-LogHost " [DATA-LOSS] WARNING: Output is PARTIAL — missing partitions: $($script:StreamingMergeMissingPartitions -join ', ')" -ForegroundColor Yellow + } + } + + # Export telemetry CSV for Graph API parallel execution analysis (one row per partition) - only when -IncludeTelemetry switch is used + # Always timestamped to prevent overwriting previous telemetry data + if ($IncludeTelemetry -and -not $OnlyUserInfo -and -not $UseEOM -and $script:telemetryData -and $script:telemetryData.Count -gt 0) { + try { + $baseName = [System.IO.Path]::GetFileNameWithoutExtension($OutputFile) + $outputDir = Split-Path $OutputFile -Parent + $telemetryPath = Join-Path $outputDir "${baseName}_telemetry_$global:ScriptRunTimestamp.csv" + $script:telemetryData | Export-Csv -Path $telemetryPath -NoTypeInformation -Encoding UTF8 + Write-LogHost "" + Write-LogHost "Graph API Telemetry: $telemetryPath" -ForegroundColor Cyan + } catch { + Write-LogHost "Warning: Failed to export telemetry CSV: $($_.Exception.Message)" -ForegroundColor Yellow + } + } + # DSPM for AI: Log DSPM features enabled + if ($IncludeDSPMForAI) { + Write-LogHost "" + Write-LogHost "DSPM for AI Features:" -ForegroundColor Cyan + Write-LogHost " Activity types: ConnectedAIAppInteraction, AIInteraction, AIAppInteraction" -ForegroundColor Cyan + if ($ExcludeCopilotInteraction) { + Write-LogHost " ✗ CopilotInteraction: EXCLUDED" -ForegroundColor Red + } + } + Write-LogHost "" + + # File output summary (skip for -OnlyUserInfo mode) + if (-not $OnlyUserInfo) { + if ($ExportWorkbook -and $OutputFile -match '\.xlsx$') { + Write-LogHost "Output workbook: $OutputFile" -ForegroundColor White + Write-LogHost "Workbook mode: $(if ($CombineOutput) { 'Single-tab (Combined)' } else { 'Multi-tab (By Activity Type)' })" -ForegroundColor White + if ($IncludeUserInfo -and -not $UseEOM) { Write-LogHost "Entra Users Tab: EntraUsers_MAClicensing" -ForegroundColor Gray } + if 
(Test-Path $OutputFile) { + Write-LogHost "File size: $([math]::Round((Get-Item $OutputFile).Length / 1KB,2)) KB" -ForegroundColor White + } + if ($AppendFile) { + Write-LogHost "Append mode: Enabled" -ForegroundColor Cyan + } + } elseif ($script:CsvSplitFiles -and $script:CsvSplitFiles.Count -gt 0) { + # CSV was split into multiple files + Write-LogHost "Output directory: $(Split-Path $OutputFile -Parent)" -ForegroundColor White + Write-LogHost "Files created: $($script:CsvSplitFiles.Count) separate CSV files" -ForegroundColor White + $totalSize = ($script:CsvSplitFiles | ForEach-Object { (Get-Item $_).Length } | Measure-Object -Sum).Sum + Write-LogHost "Total size: $([math]::Round($totalSize / 1KB,2)) KB" -ForegroundColor White + if ($IncludeUserInfo -and -not $UseEOM) { + $entraSplit = (Join-Path (Split-Path $OutputFile -Parent) "EntraUsers_MAClicensing_${global:ScriptRunTimestamp}.csv") + if (Test-Path $entraSplit) { $entraSize = [math]::Round((Get-Item $entraSplit).Length / 1KB,2); Write-LogHost "Entra Users File: $(Split-Path $entraSplit -Leaf) ($entraSize KB)" -ForegroundColor Gray } else { Write-LogHost "Entra Users File: $(Split-Path $entraSplit -Leaf) (pending generation)" -ForegroundColor Gray } + } + + # Show filename pattern instead of listing each file + $outputDir = Split-Path $OutputFile -Parent + $timestamp = [System.IO.Path]::GetFileNameWithoutExtension($script:CsvSplitFiles[0]) -replace '.*_(\d{8}_\d{6}).*', '$1' + Write-LogHost "Output pattern: ${outputDir}\Purview_Audit__${timestamp}.csv" -ForegroundColor Gray + } elseif (Test-Path $OutputFile) { + Write-LogHost "Output file: $OutputFile" -ForegroundColor White + Write-LogHost "File size: $([math]::Round((Get-Item $OutputFile).Length / 1KB,2)) KB" -ForegroundColor White + if ($IncludeUserInfo -and -not $UseEOM) { + $entraCombined = (Join-Path (Split-Path $OutputFile -Parent) "EntraUsers_MAClicensing_${global:ScriptRunTimestamp}.csv") + if (Test-Path $entraCombined) { $entraSize = 
[math]::Round((Get-Item $entraCombined).Length / 1KB,2); Write-LogHost "Entra Users File: $entraCombined ($entraSize KB)" -ForegroundColor Gray } else { Write-LogHost "Entra Users File: $entraCombined (pending)" -ForegroundColor Gray } + } + } else { + Write-LogHost "Output file: $OutputFile" -ForegroundColor White + if ($IncludeUserInfo -and -not $UseEOM) { + $entraCombined = (Join-Path (Split-Path $OutputFile -Parent) "EntraUsers_MAClicensing_${global:ScriptRunTimestamp}.csv") + Write-LogHost "Entra Users File: $entraCombined (pending)" -ForegroundColor Gray + } + Write-LogHost "File size: N/A (file may have been deleted/moved during processing)" -ForegroundColor DarkGray + } + + # Emit header-only CSV for combined mode when zero rows exported + # Use the metric directly in case $totalExported wasn't set (resume mode with no new fetches) + if ($CombineOutput -and -not $ExportWorkbook -and ([int]$script:metrics.TotalStructuredRows -eq 0)) { + try { + $headerColumns = if ($ExplodeDeep -or $ExplodeArrays -or $ForcedRawInputCsvExplosion) { $PurviewExplodedHeader } else { @('RecordId', 'CreationDate', 'RecordType', 'Operation', 'UserId', 'AuditData', 'AssociatedAdminUnits', 'AssociatedAdminUnitsNames') } + $outputDirEmpty = Split-Path $OutputFile -Parent + if (-not (Test-Path $outputDirEmpty)) { New-Item -ItemType Directory -Path $outputDirEmpty -Force | Out-Null } + $enc = New-Object System.Text.UTF8Encoding($false) + $sw = [System.IO.StreamWriter]::new($OutputFile, $false, $enc) + $escapedCols = @() + foreach ($col in $headerColumns) { + $c = [string]$col + $needsQuote = ($c -match '[",\r\n]') -or $c.StartsWith(' ') -or $c.EndsWith(' ') + $escaped = $c -replace '"', '""' + if ($needsQuote) { $escaped = '"' + $escaped + '"' } + $escapedCols += , $escaped + } + $sw.WriteLine(($escapedCols -join ',')) + $sw.Flush(); $sw.Dispose() + Write-LogHost "Header-only CSV created at: $OutputFile" -ForegroundColor Green + } catch { + Write-LogHost "Failed to write header-only CSV 
for ${OutputFile}: $($_.Exception.Message)" -ForegroundColor Red + } + } + + # Emit header-only CSVs for per-activity split when zero rows exported + # Use the metric directly in case $totalExported wasn't set (resume mode with no new fetches) + if (-not $CombineOutput -and -not $ExportWorkbook -and ([int]$script:metrics.TotalStructuredRows -eq 0) -and $ActivityTypes) { + try { + $outputDir = Split-Path $OutputFile -Parent + $timestamp = [System.IO.Path]::GetFileNameWithoutExtension($OutputFile) -replace '.*_(\d{8}_\d{6}).*', '$1' + $headerColumns = if ($ExplodeDeep -or $ExplodeArrays -or $ForcedRawInputCsvExplosion) { $PurviewExplodedHeader } else { @('RecordId', 'CreationDate', 'RecordType', 'Operation', 'UserId', 'AuditData', 'AssociatedAdminUnits', 'AssociatedAdminUnitsNames') } + foreach ($actType in $ActivityTypes) { + $file = Join-Path $outputDir ("Purview_Audit_{0}_{1}.csv" -f $actType, $timestamp) + try { + $outputDirEmpty = Split-Path $file -Parent + if (-not (Test-Path $outputDirEmpty)) { New-Item -ItemType Directory -Path $outputDirEmpty -Force | Out-Null } + $enc = New-Object System.Text.UTF8Encoding($false) + $sw = [System.IO.StreamWriter]::new($file, $false, $enc) + $escapedCols = @() + foreach ($col in $headerColumns) { + $c = [string]$col + $needsQuote = ($c -match '[",\r\n]') -or $c.StartsWith(' ') -or $c.EndsWith(' ') + $escaped = $c -replace '"', '""' + if ($needsQuote) { $escaped = '"' + $escaped + '"' } + $escapedCols += , $escaped + } + $sw.WriteLine(($escapedCols -join ',')) + $sw.Flush(); $sw.Dispose() + Write-LogHost "Header-only CSV created at: $file" -ForegroundColor Green + } catch { + Write-LogHost "Failed to write header-only CSV for ${file}: $($_.Exception.Message)" -ForegroundColor Red + } + } + } catch { + Write-LogHost "Failed to emit per-activity header-only CSVs: $($_.Exception.Message)" -ForegroundColor Red + } + } + } else { + # -OnlyUserInfo mode: Show only EntraUsers file + $entraFile = (Join-Path (Split-Path $OutputFile 
-Parent) "EntraUsers_MAClicensing_${global:ScriptRunTimestamp}.csv") + if (Test-Path $entraFile) { + $entraSize = [math]::Round((Get-Item $entraFile).Length / 1KB, 2) + Write-LogHost "EntraUsers file: $entraFile" -ForegroundColor White + Write-LogHost "File size: $entraSize KB" -ForegroundColor White + } else { + Write-LogHost "EntraUsers file: $entraFile (not found)" -ForegroundColor Yellow + } + } + + Write-LogHost "Log file: $LogFile" -ForegroundColor White + + # Mark script as completed normally (used by finally block to detect Ctrl+C) + $script:ScriptCompleted = $true + + # Clean up incremental JSONL files from this run after successful completion + # CRITICAL: Must happen AFTER explosion completes, using timestamp to identify this run's files + # This avoids the issue where $script:PartialOutputPath is null after Complete-CheckpointRun + $incrementalDir = Join-Path (Split-Path $OutputFile -Parent) ".pax_incremental" + if (Test-Path $incrementalDir) { + $thisRunPattern = "*_${global:ScriptRunTimestamp}_*records.jsonl" + $thisRunFiles = Get-ChildItem -Path $incrementalDir -Filter $thisRunPattern -ErrorAction SilentlyContinue + if ($thisRunFiles -and $thisRunFiles.Count -gt 0) { + try { + $thisRunFiles | Remove-Item -Force -ErrorAction Stop + Write-LogHost "Incremental JSONL files cleaned up ($($thisRunFiles.Count) files from this run)" -ForegroundColor DarkGray + } catch { + Write-LogHost "Note: Could not remove incremental JSONL files: $($_.Exception.Message)" -ForegroundColor DarkGray + } + } + # Also remove the directory if it's now empty + $remaining = Get-ChildItem -Path $incrementalDir -ErrorAction SilentlyContinue + if (-not $remaining -or $remaining.Count -eq 0) { + try { + Remove-Item -Path $incrementalDir -Force -ErrorAction SilentlyContinue + } catch {} + } + } +} +catch { + # Handle Ctrl+C (PipelineStoppedException) + if ($_.Exception -is [System.Management.Automation.PipelineStoppedException] -or + $_.Exception.InnerException -is 
[System.Management.Automation.PipelineStoppedException]) { + $script:CtrlCPressed = $true + } + + $msg = $_.Exception.Message + if ($msg -eq '__PAX_EARLY_EXIT__' -or $script:EarlyExit) { + # Graceful early exit path (e.g., header-only CSV) + Write-LogHost "Early exit executed: $script:EarlyExit" -ForegroundColor DarkGray + } else { + Write-LogHost "Script failed: $msg" -ForegroundColor Red + Write-LogHost $_.ScriptStackTrace -ForegroundColor Red + } +} +finally { + # Check if script was interrupted (didn't complete normally and not an early exit) + if (-not $script:ScriptCompleted -and -not $script:EarlyExit -and -not $script:CtrlCPressed) { + # Script was interrupted - likely Ctrl+C that wasn't caught by PipelineStoppedException + $script:CtrlCPressed = $true + } + + # Show graceful exit message if interrupted (and not already shown by engine event handler) + # Skip in replay mode - no Graph connection to disconnect + if ($script:CtrlCPressed -and -not $env:PAX_GRACEFUL_EXIT_DONE -and -not $env:PAX_REPLAY_MODE) { + $env:PAX_GRACEFUL_EXIT_DONE = "1" # Prevent engine event handler from also showing message + Write-Host "" + Write-Host "============================================================================================================" -ForegroundColor Yellow + Write-Host " Script Interrupted - Performing Graceful Cleanup" -ForegroundColor Yellow + Write-Host "============================================================================================================" -ForegroundColor Yellow + Write-Host "" + + # Disconnect from Microsoft Graph - ALWAYS attempt disconnect + Write-Host " Disconnecting from Microsoft Graph..." 
-ForegroundColor Cyan + try { + Disconnect-MgGraph -ErrorAction Stop | Out-Null + Write-Host " Microsoft Graph disconnected" -ForegroundColor Green + } + catch { + if ($_.Exception.Message -match 'No application to sign out from') { + Write-Host " (Not connected to Microsoft Graph)" -ForegroundColor DarkGray + } else { + Write-Host " Microsoft Graph session cleared" -ForegroundColor Green + } + } + + # Disconnect from Exchange Online (if connected via EOM mode) + try { + $eomSession = Get-PSSession | Where-Object { $_.ConfigurationName -eq 'Microsoft.Exchange' -and $_.State -eq 'Opened' } + if ($eomSession) { + Write-Host " Disconnecting from Exchange Online Management..." -ForegroundColor Cyan + Disconnect-ExchangeOnline -Confirm:$false -ErrorAction SilentlyContinue | Out-Null + Write-Host " Exchange Online disconnected" -ForegroundColor Green + } + } + catch { + Write-Host " (Exchange Online cleanup completed)" -ForegroundColor Gray + } + + # Log the interruption + if ($LogFile -and (Test-Path $LogFile -ErrorAction SilentlyContinue)) { + try { + Write-Output "" | Out-File -FilePath $LogFile -Append -Encoding utf8 + Write-Output "[$(Get-Date -Format 'yyyy-MM-dd HH:mm:ss')] Script interrupted by user (Ctrl+C)" | Out-File -FilePath $LogFile -Append -Encoding utf8 + } catch {} + } + + # Show checkpoint resume message if checkpoint is enabled + if ($script:CheckpointEnabled -and $script:CheckpointPath -and (Test-Path $script:CheckpointPath -ErrorAction SilentlyContinue)) { + Show-CheckpointExitMessage + } + + Write-Host "" + Write-Host " Cleanup complete. Exiting..." 
-ForegroundColor Green + Write-Host "" + exit 0 + } + + $endUtc = (Get-Date).ToUniversalTime() + if (-not $script:summaryWritten) { + try { if ($script:metrics -and $script:metrics.StartTime) { $startTail = $script:metrics.StartTime.ToUniversalTime().ToString('yyyy-MM-dd HH:mm:ss'); Write-Log "Script execution started at $startTail UTC" } } catch {} + Write-Log "Script execution completed at $($endUtc.ToString('yyyy-MM-dd HH:mm:ss')) UTC" + Write-Log "Script version: v$ScriptVersion" + try { if ($script:metrics -and $script:metrics.StartTime) { $elapsed = $endUtc - $script:metrics.StartTime; $totalHours = [math]::Floor($elapsed.TotalHours); $remainder = $elapsed - [TimeSpan]::FromHours($totalHours); $elapsedFormatted = ("{0}:{1:00}:{2:00}.{3:000}" -f $totalHours, $remainder.Minutes, $remainder.Seconds, $remainder.Milliseconds); Write-Log ("Total elapsed time: {0} (hours:minutes:seconds.milliseconds)" -f $elapsedFormatted) } } catch {} + $script:summaryWritten = $true + } + + # ALWAYS disconnect from Graph/EOM on script exit (completed, early exit, or error) + # This ensures credentials are cleared regardless of $script:Connected status + if (-not $UseEOM) { + # Graph API mode: Disconnect from Microsoft Graph + Write-LogHost "Disconnecting from Microsoft Graph..." 
-ForegroundColor Gray + try { + Disconnect-MgGraph -ErrorAction Stop | Out-Null + Write-LogHost " Microsoft Graph disconnected" -ForegroundColor Green + } + catch { + if ($_.Exception.Message -match 'No application to sign out from') { + Write-LogHost " (Not connected to Microsoft Graph)" -ForegroundColor DarkGray + } else { + Write-LogHost " Microsoft Graph session cleared" -ForegroundColor Green + } + } + } + + # EOM mode: Disconnect from Exchange Online + if ($UseEOM) { + try { + $eomSession = Get-PSSession | Where-Object { $_.ConfigurationName -eq 'Microsoft.Exchange' -and $_.State -eq 'Opened' } + if ($eomSession) { + Disconnect-ExchangeOnline -Confirm:$false -ErrorAction SilentlyContinue | Out-Null + Write-LogHost " Exchange Online disconnected" -ForegroundColor Green + } + } + catch {} + } + if ($EmitMetricsJson) { + try { + # Always timestamp metrics to prevent overwriting + if ($MetricsPath) { + $metricsPath = if ($MetricsPath.ToLower().EndsWith('.json')) { $MetricsPath } else { "$MetricsPath.json" } + } else { + $baseName = [System.IO.Path]::GetFileNameWithoutExtension($OutputFile) + $outputDir = Split-Path $OutputFile -Parent + $metricsPath = Join-Path $outputDir "${baseName}_metrics_$global:ScriptRunTimestamp.json" + } + $emitObj = [ordered]@{ version = $ScriptVersion; timestampUtc = (Get-Date).ToUniversalTime().ToString('o'); parameters = $paramSnapshot; metrics = $script:metrics } + ($emitObj | ConvertTo-Json -Depth 6) | Out-File -FilePath $metricsPath -Encoding UTF8 + Write-LogHost "Metrics JSON emitted: $metricsPath" -ForegroundColor DarkCyan + } catch { + Write-LogHost "Failed to emit metrics JSON: $($_.Exception.Message)" -ForegroundColor Yellow + } + } + + # NOTE: JSONL cleanup for successful runs is now handled at true script completion + # (after explosion, before this finally block) using timestamp-based file matching. + # This finally block only handles abnormal termination scenarios. 
+ + $exitCode = 0; if ($script:circuitBreakerOpen) { $exitCode = 20 } elseif (($script:Hit10KLimit -or $script:Hit1MLimit) -and -not $AutoCompleteness) { $exitCode = 10 } + Write-LogHost "Exit code: $exitCode" -ForegroundColor DarkGray + exit $exitCode +} + + + + From ef6e075b0ff4426dc48d203ec88b46cf2dc4fdd3 Mon Sep 17 00:00:00 2001 From: Brian Middendorf Date: Mon, 9 Mar 2026 17:24:04 -0500 Subject: [PATCH 2/2] PAX-v1.0.20 --- README.md | 2 +- release_documentation/.gitkeep | 2 +- release_notes/.gitkeep | 2 +- script_archive/.gitkeep | 2 +- versions.json | 6 +++--- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index da8eea0..17bab07 100644 --- a/README.md +++ b/README.md @@ -66,7 +66,7 @@ This is an experimental script. On occasion, you may notice small deviations fro --- -> **🔍 Purview Audit Log Processor:** Download the script → [`PAX_Purview_Audit_Log_Processor_v1.10.7.ps1`](https://github.com/microsoft/PAX/releases/download/purview-v1.10.7/PAX_Purview_Audit_Log_Processor_v1.10.7.ps1) +> **🔍 Purview Audit Log Processor:** Download the script → [`PAX_Purview_Audit_Log_Processor_v1.10.8.ps1`](https://github.com/microsoft/PAX/releases/download/purview-v1.10.8/PAX_Purview_Audit_Log_Processor_v1.10.8.ps1) > > **📖 Resources:** [Latest Documentation](https://github.com/microsoft/PAX/blob/release/release_documentation/Purview_Audit_Log_Processor/PAX_Purview_Audit_Log_Processor_Documentation_v1.10.0.md) | [Latest Release Notes](https://github.com/microsoft/PAX/blob/release/release_notes/Purview_Audit_Log_Processor/PAX_Purview_Audit_Log_Processor_Release_Note_v1.10.0.md) > diff --git a/release_documentation/.gitkeep b/release_documentation/.gitkeep index 34d0ab6..22b6a34 100644 --- a/release_documentation/.gitkeep +++ b/release_documentation/.gitkeep @@ -1 +1 @@ -# Last updated: 2026-03-05 (PAX v1.0.19, Graph v1.0.1, Purview v1.10.7, CopilotInteractions v1.2.0) \ No newline at end of file +# Last updated: 2026-03-09 (PAX v1.0.20, 
Graph v1.0.1, Purview v1.10.8, CopilotInteractions v1.2.0) \ No newline at end of file diff --git a/release_notes/.gitkeep b/release_notes/.gitkeep index 34d0ab6..22b6a34 100644 --- a/release_notes/.gitkeep +++ b/release_notes/.gitkeep @@ -1 +1 @@ -# Last updated: 2026-03-05 (PAX v1.0.19, Graph v1.0.1, Purview v1.10.7, CopilotInteractions v1.2.0) \ No newline at end of file +# Last updated: 2026-03-09 (PAX v1.0.20, Graph v1.0.1, Purview v1.10.8, CopilotInteractions v1.2.0) \ No newline at end of file diff --git a/script_archive/.gitkeep b/script_archive/.gitkeep index 34d0ab6..22b6a34 100644 --- a/script_archive/.gitkeep +++ b/script_archive/.gitkeep @@ -1 +1 @@ -# Last updated: 2026-03-05 (PAX v1.0.19, Graph v1.0.1, Purview v1.10.7, CopilotInteractions v1.2.0) \ No newline at end of file +# Last updated: 2026-03-09 (PAX v1.0.20, Graph v1.0.1, Purview v1.10.8, CopilotInteractions v1.2.0) \ No newline at end of file diff --git a/versions.json b/versions.json index be7431f..e509b11 100644 --- a/versions.json +++ b/versions.json @@ -6,13 +6,13 @@ "products": { "pax": { "name": "PAX Infrastructure", - "version": "1.0.19", + "version": "1.0.20", "status": "development", "notes": "Core infrastructure, workflows, governance (iterating beyond last tagged release)" }, "purview": { "name": "Purview Audit Log Processor", - "version": "1.10.7", + "version": "1.10.8", "status": "development", "notes": "Canonical root script; prior versions in script_archive/Purview_Audit_Log_Processor." }, @@ -29,5 +29,5 @@ "notes": "Canonical root script; prior versions in script_archive/CopilotInteractions_Content_Audit_Log_Processor." } }, - "lastUpdated": "2026-03-05T00:00:00Z" + "lastUpdated": "2026-03-09T00:00:00Z" }