From 5e4acbc804d9942f1c2610ba0eb0aa385e37c648 Mon Sep 17 00:00:00 2001 From: Pursche Date: Sat, 25 Apr 2026 15:59:45 +0200 Subject: [PATCH 1/2] Make CanvasRenderer use IndirectDraw Merge Panel and Text into single renderpipeline Add CPU sorting of CanvasRenderer drawcalls Move CanvasRenderer to indirect drawcalls Considered using GPU sorting but CPU was faster Add sorting test to UI/Demo.luau Fix key mapping mixup in Input.luau Fix hardcoded Light color Add GPU RadixSort utils, currently unused Get rid of old GPU FFX ParallelSort --- .gitignore | 4 +- .../Game-Lib/ECS/Components/UI/Widget.h | 12 + .../Game-Lib/Game-Lib/ECS/Util/Transform2D.h | 31 +- Source/Game-Lib/Game-Lib/ECS/Util/UIUtil.cpp | 58 ++ Source/Game-Lib/Game-Lib/ECS/Util/UIUtil.h | 12 + .../Rendering/Canvas/CanvasRenderer.cpp | 741 +++++++++++------- .../Rendering/Canvas/CanvasRenderer.h | 109 ++- .../Game-Lib/Rendering/Util/RadixSort.cpp | 294 +++++++ .../Game-Lib/Rendering/Util/RadixSort.h | 113 +++ Source/Resources/Scripts/API/Input/Input.luau | 2 +- Source/Resources/Scripts/UI/Demo.luau | 415 +++++++++- .../Shaders/Include/Lighting.inc.slang | 3 - .../Sorting/FFX_ParallelSort.inc.slang | 454 ----------- .../Shaders/Sorting/Radix/Constants.inc.slang | 7 + .../Shaders/Sorting/Radix/Downsweep.cs.slang | 235 ++++++ .../Shaders/Sorting/Radix/Spine.cs.slang | 97 +++ .../Shaders/Sorting/Radix/Upsweep.cs.slang | 59 ++ .../Shaders/Sorting/SortCount.cs.slang | 40 - .../Shaders/Sorting/SortCountReduce.cs.slang | 33 - .../Shaders/Shaders/Sorting/SortScan.cs.slang | 35 - .../Shaders/Sorting/SortScanAdd.cs.slang | 45 -- .../Shaders/Sorting/SortScatter.cs.slang | 42 - .../SortSetupIndirectParameters.cs.slang | 45 -- Source/Shaders/Shaders/UI/Text.ps.slang | 112 --- Source/Shaders/Shaders/UI/Text.vs.slang | 67 -- .../UI/{Panel.ps.slang => Widget.ps.slang} | 167 ++-- .../UI/{Panel.vs.slang => Widget.vs.slang} | 26 +- 27 files changed, 1979 insertions(+), 1279 deletions(-) create mode 100644 
Source/Game-Lib/Game-Lib/Rendering/Util/RadixSort.cpp create mode 100644 Source/Game-Lib/Game-Lib/Rendering/Util/RadixSort.h delete mode 100644 Source/Shaders/Shaders/Sorting/FFX_ParallelSort.inc.slang create mode 100644 Source/Shaders/Shaders/Sorting/Radix/Constants.inc.slang create mode 100644 Source/Shaders/Shaders/Sorting/Radix/Downsweep.cs.slang create mode 100644 Source/Shaders/Shaders/Sorting/Radix/Spine.cs.slang create mode 100644 Source/Shaders/Shaders/Sorting/Radix/Upsweep.cs.slang delete mode 100644 Source/Shaders/Shaders/Sorting/SortCount.cs.slang delete mode 100644 Source/Shaders/Shaders/Sorting/SortCountReduce.cs.slang delete mode 100644 Source/Shaders/Shaders/Sorting/SortScan.cs.slang delete mode 100644 Source/Shaders/Shaders/Sorting/SortScanAdd.cs.slang delete mode 100644 Source/Shaders/Shaders/Sorting/SortScatter.cs.slang delete mode 100644 Source/Shaders/Shaders/Sorting/SortSetupIndirectParameters.cs.slang delete mode 100644 Source/Shaders/Shaders/UI/Text.ps.slang delete mode 100644 Source/Shaders/Shaders/UI/Text.vs.slang rename Source/Shaders/Shaders/UI/{Panel.ps.slang => Widget.ps.slang} (50%) rename Source/Shaders/Shaders/UI/{Panel.vs.slang => Widget.vs.slang} (59%) diff --git a/.gitignore b/.gitignore index 80ee0a97..df413e5c 100644 --- a/.gitignore +++ b/.gitignore @@ -39,4 +39,6 @@ CMakeSettings.json # Exceptions .cache/ -*.patch \ No newline at end of file +*.patch +.claude/ +images/ \ No newline at end of file diff --git a/Source/Game-Lib/Game-Lib/ECS/Components/UI/Widget.h b/Source/Game-Lib/Game-Lib/ECS/Components/UI/Widget.h index 053fefa8..95b52931 100644 --- a/Source/Game-Lib/Game-Lib/ECS/Components/UI/Widget.h +++ b/Source/Game-Lib/Game-Lib/ECS/Components/UI/Widget.h @@ -39,6 +39,10 @@ namespace ECS::Components::UI WidgetFlags flags = WidgetFlags::Default; u32 worldTransformIndex = std::numeric_limits().max(); + // Packed draw-order sortkey computed by CanvasRenderer. See CanvasRenderer::DfsAssignSortKey for the layout. 
+ // Sibling-order tiebreaker lives on SceneNode2D as siblingIndex (monotonic per-parent). + u32 sortKey = 0; + Scripting::UI::Widget* scriptWidget = nullptr; // Non mutable helper functions @@ -55,4 +59,12 @@ namespace ECS::Components::UI struct DirtyWidgetClipper {}; struct DirtyWidgetWorldTransformIndex {}; struct DestroyWidget {}; + + // Marks a canvas whose widget subtree needs its sortKeys recomputed by CanvasRenderer. + struct DirtyCanvasSort {}; + + // Registry-context singleton: set when the SET of canvases (or a canvas's layer) changes, + // so CanvasRenderer knows it needs to re-rank canvasOrder before re-running DfsAssignSortKey. + // Cleared inside CanvasRenderer::Update after RebuildCanvasOrder runs. + struct DirtyCanvasOrderFlag {}; } \ No newline at end of file diff --git a/Source/Game-Lib/Game-Lib/ECS/Util/Transform2D.h b/Source/Game-Lib/Game-Lib/ECS/Util/Transform2D.h index bc454e70..8e4042c4 100644 --- a/Source/Game-Lib/Game-Lib/ECS/Util/Transform2D.h +++ b/Source/Game-Lib/Game-Lib/ECS/Util/Transform2D.h @@ -287,8 +287,10 @@ namespace ECS::Components prevSibling->nextSibling = nextSibling; nextSibling->prevSibling = prevSibling; + // If we were the head of the list, the new head is the next sibling + // (which preserves insertion order: the second-inserted child becomes first). if (parent->firstChild == this) - parent->firstChild = prevSibling; + parent->firstChild = nextSibling; } nextSibling = nullptr; @@ -312,14 +314,20 @@ namespace ECS::Components } else { - //insert after the firstchild - nextSibling = newParent->firstChild->nextSibling; - prevSibling = newParent->firstChild; + // Append to the END of the circular sibling list (i.e. insert just before firstChild). + // This makes iteration order match insertion order, so siblings are drawn in the order they were created. 
+ nextSibling = newParent->firstChild; + prevSibling = newParent->firstChild->prevSibling; prevSibling->nextSibling = this; nextSibling->prevSibling = this; } parent = newParent; + + // Assign a unique-within-current-siblings index. Using a monotonic counter on + // the parent rather than parent->children guarantees uniqueness even after + // detach+reattach cycles (where children decrements but nextSiblingIndex does not). + siblingIndex = newParent->nextSiblingIndex++; } //updates transform matrix of the children. does not recalculate matrix @@ -385,6 +393,21 @@ namespace ECS::Components SceneNode2D* nextSibling{}; SceneNode2D* prevSibling{}; i32 children{ 0 }; + + // Monotonic per-parent counter. Bumped each time a child is attached; used + // to assign a unique siblingIndex that never collides with concurrent siblings, + // even after detach/reattach cycles on the same parent. u32 so wraparound is + // irrelevant at any realistic UI churn rate. + u32 nextSiblingIndex{ 0 }; + // Unique index within this node's current parent. Set by SetParent. Used as + // the tiebreaker when two siblings have the same Z in the draw sort. 
+ u32 siblingIndex{ 0 }; + + public: + u32 GetSiblingIndex() const + { + return siblingIndex; + } }; } diff --git a/Source/Game-Lib/Game-Lib/ECS/Util/UIUtil.cpp b/Source/Game-Lib/Game-Lib/ECS/Util/UIUtil.cpp index 16be20b0..d4fdc1e7 100644 --- a/Source/Game-Lib/Game-Lib/ECS/Util/UIUtil.cpp +++ b/Source/Game-Lib/Game-Lib/ECS/Util/UIUtil.cpp @@ -34,6 +34,42 @@ namespace ECS::Util { namespace UI { + entt::entity FindOwningCanvas(entt::registry* registry, entt::entity entity) + { + if (entity == entt::null) + return entt::null; + + auto* widget = registry->try_get(entity); + if (!widget) + return entt::null; + + if (widget->type == ECS::Components::UI::WidgetType::Canvas) + return entity; + + if (widget->scriptWidget) + return widget->scriptWidget->canvasEntity; + + return entt::null; + } + + void MarkCanvasSortDirty(entt::registry* registry, entt::entity canvasEntity) + { + if (canvasEntity == entt::null) + return; + registry->emplace_or_replace(canvasEntity); + } + + void MarkAllCanvasSortDirty(entt::registry* registry) + { + registry->view().each([&](entt::entity canvasEntity, auto&) + { + registry->emplace_or_replace(canvasEntity); + }); + // The canvas SET changed -> canvasOrder ranking is stale; gates the (relatively + // expensive) RebuildCanvasOrder pass next time CanvasRenderer::Update runs. + registry->ctx().emplace(); + } + entt::entity GetOrEmplaceCanvas(Scripting::UI::Widget*& widget, entt::registry* registry, const char* name, vec2 pos, ivec2 size, bool isRenderTexture) { ECS::Singletons::UISingleton& uiSingleton = registry->ctx().get(); @@ -109,6 +145,11 @@ namespace ECS::Util registry->emplace(entity); } + // A new canvas entering the system shifts canvasOrder for everyone; + // mark every canvas (including this one) so all widget sortKeys get their + // canvasOrder bits refreshed on the next CanvasRenderer::Update tick. 
+ MarkAllCanvasSortDirty(registry); + return entity; } @@ -201,6 +242,9 @@ namespace ECS::Util eventInputInfo.onFocusEndEvent = panelTemplateComp.onFocusEndEvent; eventInputInfo.onFocusHeldEvent = panelTemplateComp.onFocusHeldEvent; + // New widget entering the tree -> owning canvas needs sort-key rebuild. + MarkCanvasSortDirty(registry, FindOwningCanvas(registry, parent)); + return entity; } @@ -285,6 +329,9 @@ namespace ECS::Util eventInputInfo.onFocusEndEvent = textTemplate.onFocusEndEvent; eventInputInfo.onFocusHeldEvent = textTemplate.onFocusHeldEvent; + // New widget entering the tree -> owning canvas needs sort-key rebuild. + MarkCanvasSortDirty(registry, FindOwningCanvas(registry, parent)); + return entity; } @@ -311,6 +358,9 @@ namespace ECS::Util widgetComp.type = ECS::Components::UI::WidgetType::Widget; widgetComp.scriptWidget = widget; + // New widget entering the tree -> owning canvas needs sort-key rebuild. + MarkCanvasSortDirty(registry, FindOwningCanvas(registry, parent)); + return entity; } @@ -319,6 +369,10 @@ namespace ECS::Util if (!registry->all_of(entity)) return false; + // Widgets leaving the tree changes the sibling set in their owning canvas. + // Mark it dirty BEFORE we mutate the scriptWidget or clear the parent, so FindOwningCanvas still resolves. + MarkCanvasSortDirty(registry, FindOwningCanvas(registry, entity)); + auto& transform2DSystem = Transform2DSystem::Get(*registry); transform2DSystem.ClearParent(entity); @@ -382,6 +436,10 @@ namespace ECS::Util CallLuaEvent(eventInputInfo->onFocusBeginEvent, Scripting::UI::UIInputEvent::FocusBegin, widget.scriptWidget); } } + + // Focus affects sortKey (priority bits), so both the previously focused and the newly focused widget's canvases need their sortKeys rebuilt. 
+ MarkCanvasSortDirty(registry, FindOwningCanvas(registry, oldFocus)); + MarkCanvasSortDirty(registry, FindOwningCanvas(registry, entity)); } entt::entity GetFocusedWidgetEntity(entt::registry* registry) diff --git a/Source/Game-Lib/Game-Lib/ECS/Util/UIUtil.h b/Source/Game-Lib/Game-Lib/ECS/Util/UIUtil.h index e20e76da..a4756ae7 100644 --- a/Source/Game-Lib/Game-Lib/ECS/Util/UIUtil.h +++ b/Source/Game-Lib/Game-Lib/ECS/Util/UIUtil.h @@ -36,6 +36,18 @@ namespace ECS::Util void FocusWidgetEntity(entt::registry* registry, entt::entity entity); entt::entity GetFocusedWidgetEntity(entt::registry* registry); + // Returns the canvas entity that owns the given widget entity (the widget itself if it IS a canvas). + // Walks the scriptWidget->canvasEntity chain; returns entt::null if the entity has no Widget component. + entt::entity FindOwningCanvas(entt::registry* registry, entt::entity entity); + + // Mark a single canvas as needing its widget sort-keys recomputed (by CanvasRenderer::Update next frame). + // Safe to call with entt::null; becomes a no-op. + void MarkCanvasSortDirty(entt::registry* registry, entt::entity canvasEntity); + + // Mark every canvas in the registry as needing sort-keys recomputed. Used when the set of canvases itself + // changes (new canvas, canvas SetLayer) so that canvasOrder bits are refreshed everywhere. 
+ void MarkAllCanvasSortDirty(entt::registry* registry); + void RefreshText(entt::registry* registry, entt::entity entity, std::string_view newText); void RefreshTemplate(entt::registry* registry, entt::entity entity, ECS::Components::UI::EventInputInfo& eventInputInfo); void RefreshClipper(entt::registry* registry, entt::entity entity); diff --git a/Source/Game-Lib/Game-Lib/Rendering/Canvas/CanvasRenderer.cpp b/Source/Game-Lib/Game-Lib/Rendering/Canvas/CanvasRenderer.cpp index c59f07c2..c5880977 100644 --- a/Source/Game-Lib/Game-Lib/Rendering/Canvas/CanvasRenderer.cpp +++ b/Source/Game-Lib/Game-Lib/Rendering/Canvas/CanvasRenderer.cpp @@ -27,13 +27,15 @@ #include #include +#include +#include + using namespace ECS::Components::UI; void CanvasRenderer::Clear() { _vertices.Clear(); - _panelDrawDatas.Clear(); - _charDrawDatas.Clear(); + _widgetDrawDatas.Clear(); _textureNameHashToIndex.clear(); _textureIDToIndex.clear(); @@ -44,8 +46,7 @@ CanvasRenderer::CanvasRenderer(Renderer::Renderer* renderer, GameRenderer* gameR : _renderer(renderer) , _gameRenderer(gameRenderer) , _debugRenderer(debugRenderer) - , _panelDescriptorSet(Renderer::DescriptorSetSlot::PER_PASS) - , _textDescriptorSet(Renderer::DescriptorSetSlot::PER_PASS) + , _widgetDescriptorSet(Renderer::DescriptorSetSlot::PER_PASS) { CreatePermanentResources(); } @@ -67,14 +68,31 @@ void CanvasRenderer::Update(f32 deltaTime) uiRegistry->view().each([&](entt::entity entity, Widget& widget) { if (widget.type == WidgetType::Canvas) + { + // RT canvases own retained GPU buffers (finalSortedArgs + finalCount). Destroy them + // and erase the bucket so we don't leak per-canvas allocations on dynamic UI churn. + // Non-RT canvases all share _mainBucket, which is process-lifetime and not freed here. 
+ if (uiRegistry->all_of(entity)) + { + auto it = _rtBuckets.find(entity); + if (it != _rtBuckets.end()) + { + if (it->second.finalSortedArgs != Renderer::BufferID::Invalid()) + _renderer->QueueDestroyBuffer(it->second.finalSortedArgs); + if (it->second.finalCount != Renderer::BufferID::Invalid()) + _renderer->QueueDestroyBuffer(it->second.finalCount); + _rtBuckets.erase(it); + } + } return; + } if (widget.type == WidgetType::Panel) { auto& panel = uiRegistry->get(entity); if (panel.gpuDataIndex != -1) - _panelDrawDatas.Remove(panel.gpuDataIndex); + _widgetDrawDatas.Remove(panel.gpuDataIndex); if (panel.gpuVertexIndex != -1) _vertices.Remove(panel.gpuVertexIndex, 6); @@ -84,7 +102,7 @@ void CanvasRenderer::Update(f32 deltaTime) auto& text = uiRegistry->get(entity); if (text.gpuDataIndex != -1) - _charDrawDatas.Remove(text.gpuDataIndex, text.numCharsNonWhitespace); + _widgetDrawDatas.Remove(text.gpuDataIndex, text.numCharsNonWhitespace); if (text.gpuVertexIndex != -1) _vertices.Remove(text.gpuVertexIndex, text.numCharsNonWhitespace * 6); // * 6 because 6 vertices per char @@ -220,24 +238,57 @@ void CanvasRenderer::Update(f32 deltaTime) if (_vertices.SyncToGPU(_renderer)) { - _panelDescriptorSet.Bind("_vertices", _vertices.GetBuffer()); - _textDescriptorSet.Bind("_vertices", _vertices.GetBuffer()); + _widgetDescriptorSet.Bind("_vertices", _vertices.GetBuffer()); } - if (_panelDrawDatas.SyncToGPU(_renderer)) + if (_widgetDrawDatas.SyncToGPU(_renderer)) { - _panelDescriptorSet.Bind("_panelDrawDatas", _panelDrawDatas.GetBuffer()); + _widgetDescriptorSet.Bind("_widgetDrawDatas", _widgetDrawDatas.GetBuffer()); } - if (_charDrawDatas.SyncToGPU(_renderer)) + if (_widgetWorldPositions.SyncToGPU(_renderer)) { - _textDescriptorSet.Bind("_charDrawDatas", _charDrawDatas.GetBuffer()); + _widgetDescriptorSet.Bind("_widgetWorldPositions", _widgetWorldPositions.GetBuffer()); } - if (_widgetWorldPositions.SyncToGPU(_renderer)) + // Rebuild sort-keys + refresh dirty buckets in one 
combined pass. + // + // DirtyCanvasSort is set by every operation that changes a canvas's draw ORDER (widget + // create/destroy, focus change, reparent). DirtyCanvasOrderFlag is a registry-context + // singleton set when the canvas SET itself changed (canvas create/destroy/SetLayer); it + // gates the (relatively expensive) RebuildCanvasOrder pass. + // + // Bucket refresh is driven by DirtyCanvasSort -- NOT DirtyCanvasTag. DirtyCanvasTag fires + // for any visual mutation (color, text content, etc.) which doesn't require a re-sort; its + // only remaining job is gating which RT canvases get re-DRAWN by AddCanvasPass. { - _panelDescriptorSet.Bind("_widgetWorldPositions", _widgetWorldPositions.GetBuffer()); - _textDescriptorSet.Bind("_widgetWorldPositions", _widgetWorldPositions.GetBuffer()); + auto dirtySortView = uiRegistry->view(); + if (dirtySortView.begin() != dirtySortView.end()) + { + if (uiRegistry->ctx().contains()) + { + RebuildCanvasOrder(uiRegistry); + uiRegistry->ctx().erase(); + } + + bool mainBucketDirty = false; + dirtySortView.each([&](entt::entity canvasEntity, Canvas&) + { + u8 canvasOrder = _canvasOrderByEntity.at(canvasEntity); + u32 traversalIndex = 0; + u8 rootPriority = ResolvePriority(uiRegistry, canvasEntity); + DfsAssignSortKey(uiRegistry, canvasEntity, canvasOrder, traversalIndex, rootPriority); + + if (uiRegistry->all_of(canvasEntity)) + RefreshBucketCPU(uiRegistry, canvasEntity, /*isRT=*/true); + else + mainBucketDirty = true; + }); + if (mainBucketDirty) + RefreshBucketCPU(uiRegistry, entt::null, /*isRT=*/false); + + uiRegistry->clear(); + } } uiRegistry->clear(); @@ -264,13 +315,22 @@ void CanvasRenderer::UpdateWorldTransform(u32 index, const vec3& position) void CanvasRenderer::AddCanvasPass(Renderer::RenderGraph* renderGraph, RenderResources& resources, u8 frameIndex) { + // --- "Canvases" (graphics) ----------------------------------------------------------------- + // Per bucket, bind its retained finalSortedArgs + finalCount 
and issue one DrawIndirectCount. + // finalSortedArgs is populated CPU-side by RefreshBucketCPU via std::sort + UploadToBuffer; + // this pass just consumes it. struct Data { Renderer::ImageMutableResource target; + // Per-bucket buffer resources, in the same order as _drawBuckets below. Each element i + // corresponds to a {RT canvas or main} DrawIndirectCount call. + std::vector argBuffers; + std::vector countBuffers; + std::vector bucketCanvasEntities; // entt::null for the main bucket + Renderer::DescriptorSetResource globalDescriptorSet; - Renderer::DescriptorSetResource panelDescriptorSet; - Renderer::DescriptorSetResource textDescriptorSet; + Renderer::DescriptorSetResource widgetDescriptorSet; }; renderGraph->AddPass("Canvases", [this, &resources](Data& data, Renderer::RenderGraphBuilder& builder) // Setup @@ -283,13 +343,35 @@ void CanvasRenderer::AddCanvasPass(Renderer::RenderGraph* renderGraph, RenderRes builder.Read(_vertices.GetBuffer(), BufferUsage::GRAPHICS); - builder.Read(_panelDrawDatas.GetBuffer(), BufferUsage::GRAPHICS); - builder.Read(_charDrawDatas.GetBuffer(), BufferUsage::GRAPHICS); + builder.Read(_widgetDrawDatas.GetBuffer(), BufferUsage::GRAPHICS); builder.Read(_widgetWorldPositions.GetBuffer(), BufferUsage::GRAPHICS); + // Register each drawable bucket's retained final buffers. + entt::registry* registry = ServiceLocator::GetEnttRegistries()->uiRegistry; + + // RT canvases: only dirty ones draw this frame. + registry->view().each( + [&](entt::entity canvasEntity, Canvas&) + { + auto it = _rtBuckets.find(canvasEntity); + if (it == _rtBuckets.end() || it->second.drawCount == 0) + return; + BucketResources& b = it->second; + data.argBuffers.push_back(builder.Read(b.finalSortedArgs, BufferUsage::GRAPHICS)); + data.countBuffers.push_back(builder.Read(b.finalCount, BufferUsage::GRAPHICS)); + data.bucketCanvasEntities.push_back(canvasEntity); + }); + + // Main bucket: always drawn if non-empty. 
+ if (_mainBucket.drawCount > 0) + { + data.argBuffers.push_back(builder.Read(_mainBucket.finalSortedArgs, BufferUsage::GRAPHICS)); + data.countBuffers.push_back(builder.Read(_mainBucket.finalCount, BufferUsage::GRAPHICS)); + data.bucketCanvasEntities.push_back(entt::null); + } + data.globalDescriptorSet = builder.Use(resources.globalDescriptorSet); - data.panelDescriptorSet = builder.Use(_panelDescriptorSet); - data.textDescriptorSet = builder.Use(_textDescriptorSet); + data.widgetDescriptorSet = builder.Use(_widgetDescriptorSet); return true;// Return true from setup to enable this pass, return false to disable it }, @@ -297,152 +379,82 @@ void CanvasRenderer::AddCanvasPass(Renderer::RenderGraph* renderGraph, RenderRes { GPU_SCOPED_PROFILER_ZONE(commandList, DebugRender2D); entt::registry* registry = ServiceLocator::GetEnttRegistries()->uiRegistry; - auto& transform2DSystem = ECS::Transform2DSystem::Get(*registry); - - Renderer::GraphicsPipelineID currentPipeline; - _lastRenderedWidgetType = WidgetType::None; - - // Loop over dirty rendertarget canvases - registry->view().each([&](auto entity, auto& canvas) - { - Renderer::TextureBaseDesc textureDesc = _renderer->GetDesc(canvas.renderTexture); - commandList.SetViewport(0, 0, static_cast(textureDesc.width), static_cast(textureDesc.height), 0.0f, 1.0f); - commandList.SetScissorRect(0, static_cast(textureDesc.width), 0, static_cast(textureDesc.height)); - - Renderer::TextureRenderPassDesc renderPassDesc; - renderPassDesc.renderTargets[0] = canvas.renderTexture; - renderPassDesc.clearRenderTargets[0] = true; - bool hasDrawn = false; - - // Loop over children recursively (depth first) - transform2DSystem.IterateChildrenRecursiveDepth(entity, [&, registry](auto childEntity) - { - auto& transform = registry->get(childEntity); - auto& childWidget = registry->get(childEntity); - - if (!childWidget.IsVisible()) - return false; // Skip invisible widgets - - if (childWidget.type == WidgetType::Canvas) - return true; // There 
is nothing to draw for a canvas - - if (!hasDrawn) - { - commandList.PushMarker("RT Canvas: " + canvas.name, Color::PastelOrange); - commandList.BeginRenderPass(renderPassDesc); - hasDrawn = true; - } - - if (ChangePipelineIfNecessary(commandList, currentPipeline, childWidget.type)) - { - if (childWidget.type == WidgetType::Panel) - { - commandList.BindDescriptorSet(data.panelDescriptorSet, frameIndex); - } - else if (childWidget.type == WidgetType::Text) - { - commandList.BindDescriptorSet(data.textDescriptorSet, frameIndex); - } - } - - if (childWidget.type == WidgetType::Panel) - { - auto& panel = registry->get(childEntity); - RenderPanel(commandList, transform, childWidget, panel); - } - else if (childWidget.type == WidgetType::Text) - { - auto& text = registry->get(childEntity); - if (text.numCharsNonWhitespace > 0) - { - RenderText(commandList, transform, childWidget, text); - } - } - - return true; - }); - - if (hasDrawn) - { - commandList.EndPipeline(currentPipeline); - commandList.EndRenderPass(renderPassDesc); - commandList.PopMarker(); - } - }); - - _lastRenderedWidgetType = WidgetType::None; vec2 renderSize = _renderer->GetRenderSize(); - commandList.SetViewport(0, 0, renderSize.x, renderSize.y, 0.0f, 1.0f); - commandList.SetScissorRect(0, static_cast(renderSize.x), 0, static_cast(renderSize.y)); - // Loop over regular canvases + // Single instance, used for the main bucket's BeginRenderPass during the loop AND for + // EndRenderPass after the loop. Avoids the previous "init this struct three separate + // times" dance. 
Renderer::RenderPassDesc mainRenderPassDesc; graphResources.InitializeRenderPassDesc(mainRenderPassDesc); mainRenderPassDesc.renderTargets[0] = data.target; - commandList.BeginRenderPass(mainRenderPassDesc); - registry->view(entt::exclude).each([&](auto entity, auto& canvas) + bool mainRenderPassOpen = false; + + for (size_t i = 0; i < data.bucketCanvasEntities.size(); ++i) { - bool hasDrawn = false; + entt::entity canvasEntity = data.bucketCanvasEntities[i]; + const bool isMain = (canvasEntity == entt::null); - // Loop over children recursively (depth first) - transform2DSystem.IterateChildrenRecursiveDepth(entity, [&, registry](auto childEntity) + u32 drawCount = 0; + if (isMain) { - auto& transform = registry->get(childEntity); - auto& childWidget = registry->get(childEntity); - - if (!childWidget.IsVisible()) - return false; // Skip invisible widgets - - if (childWidget.type == WidgetType::Canvas) - return true; // There is nothing to draw for a canvas - - if (!hasDrawn) - { - commandList.PushMarker("Canvas: " + canvas.name, Color::PastelOrange); - hasDrawn = true; - } - - if (ChangePipelineIfNecessary(commandList, currentPipeline, childWidget.type)) - { - commandList.BindDescriptorSet(data.globalDescriptorSet, frameIndex); - if (childWidget.type == WidgetType::Panel) - { - commandList.BindDescriptorSet(data.panelDescriptorSet, frameIndex); - } - else if (childWidget.type == WidgetType::Text) - { - commandList.BindDescriptorSet(data.textDescriptorSet, frameIndex); - } - } - - if (childWidget.type == WidgetType::Panel) - { - auto& panel = registry->get(childEntity); - RenderPanel(commandList, transform, childWidget, panel); - } - else if (childWidget.type == WidgetType::Text) - { - auto& text = registry->get(childEntity); - if (text.numCharsNonWhitespace > 0) - { - RenderText(commandList, transform, childWidget, text); - } - } - - return true; - }); + drawCount = _mainBucket.drawCount; + } + else + { + auto it = _rtBuckets.find(canvasEntity); + drawCount = (it 
== _rtBuckets.end()) ? 0 : it->second.drawCount; + } + if (drawCount == 0) + continue; - if (hasDrawn) + if (!isMain) { + auto& canvas = registry->get(canvasEntity); + Renderer::TextureBaseDesc textureDesc = _renderer->GetDesc(canvas.renderTexture); + commandList.SetViewport(0, 0, static_cast(textureDesc.width), static_cast(textureDesc.height), 0.0f, 1.0f); + commandList.SetScissorRect(0, static_cast(textureDesc.width), 0, static_cast(textureDesc.height)); + + Renderer::TextureRenderPassDesc renderPassDesc; + renderPassDesc.renderTargets[0] = canvas.renderTexture; + renderPassDesc.clearRenderTargets[0] = true; + + commandList.PushMarker("RT Canvas: " + canvas.name, Color::PastelOrange); + commandList.BeginRenderPass(renderPassDesc); + commandList.BeginPipeline(_widgetPipeline); + commandList.BindDescriptorSet(data.globalDescriptorSet, frameIndex); + commandList.BindDescriptorSet(data.widgetDescriptorSet, frameIndex); + commandList.DrawIndirectCount(data.argBuffers[i], 0, data.countBuffers[i], 0, drawCount); + commandList.EndPipeline(_widgetPipeline); + commandList.EndRenderPass(renderPassDesc); commandList.PopMarker(); } - }); + else + { + if (!mainRenderPassOpen) + { + commandList.SetViewport(0, 0, renderSize.x, renderSize.y, 0.0f, 1.0f); + commandList.SetScissorRect(0, static_cast(renderSize.x), 0, static_cast(renderSize.y)); + commandList.BeginRenderPass(mainRenderPassDesc); + mainRenderPassOpen = true; + } + commandList.BeginPipeline(_widgetPipeline); + commandList.BindDescriptorSet(data.globalDescriptorSet, frameIndex); + commandList.BindDescriptorSet(data.widgetDescriptorSet, frameIndex); + commandList.DrawIndirectCount(data.argBuffers[i], 0, data.countBuffers[i], 0, drawCount); + commandList.EndPipeline(_widgetPipeline); + } + } - if (_lastRenderedWidgetType != WidgetType::None) + // Always end the frame with the main render pass closed. 
If nothing got drawn into + // main (zero non-RT canvas draws), still open+close so downstream passes see a clean + // sceneColor attachment state. + if (!mainRenderPassOpen) { - commandList.EndPipeline(currentPipeline); + commandList.SetViewport(0, 0, renderSize.x, renderSize.y, 0.0f, 1.0f); + commandList.SetScissorRect(0, static_cast(renderSize.x), 0, static_cast(renderSize.y)); + commandList.BeginRenderPass(mainRenderPassDesc); } commandList.EndRenderPass(mainRenderPassDesc); @@ -459,8 +471,7 @@ void CanvasRenderer::CreatePermanentResources() textureArrayDesc.size = 4096; _textures = _renderer->CreateTextureArray(textureArrayDesc); - _panelDescriptorSet.Bind("_textures", _textures); - _textDescriptorSet.Bind("_textures", _textures); + _widgetDescriptorSet.Bind("_textures", _textures); Renderer::DataTextureDesc dataTextureDesc; dataTextureDesc.width = 1; @@ -491,8 +502,7 @@ void CanvasRenderer::CreatePermanentResources() samplerDesc.shaderVisibility = Renderer::ShaderVisibility::PIXEL; _sampler = _renderer->CreateSampler(samplerDesc); - _panelDescriptorSet.Bind("_sampler"_h, _sampler); - _textDescriptorSet.Bind("_sampler"_h, _sampler); + _widgetDescriptorSet.Bind("_sampler"_h, _sampler); textureArrayDesc.size = 256; _fontTextures = _renderer->CreateTextureArray(textureArrayDesc); @@ -500,16 +510,13 @@ void CanvasRenderer::CreatePermanentResources() _font = Renderer::Font::GetDefaultFont(_renderer); _renderer->AddTextureToArray(_font->GetTextureID(), _fontTextures); - _textDescriptorSet.Bind("_fontTextures"_h, _fontTextures); + _widgetDescriptorSet.Bind("_fontTextures"_h, _fontTextures); _vertices.SetDebugName("UIVertices"); _vertices.SetUsage(Renderer::BufferUsage::STORAGE_BUFFER); - _panelDrawDatas.SetDebugName("PanelDrawDatas"); - _panelDrawDatas.SetUsage(Renderer::BufferUsage::STORAGE_BUFFER); - - _charDrawDatas.SetDebugName("CharDrawDatas"); - _charDrawDatas.SetUsage(Renderer::BufferUsage::STORAGE_BUFFER); + _widgetDrawDatas.SetDebugName("WidgetDrawDatas"); + 
_widgetDrawDatas.SetUsage(Renderer::BufferUsage::STORAGE_BUFFER); _widgetWorldPositions.SetDebugName("WidgetWorldPositions"); _widgetWorldPositions.SetUsage(Renderer::BufferUsage::STORAGE_BUFFER); @@ -520,73 +527,40 @@ void CanvasRenderer::CreatePermanentResources() void CanvasRenderer::CreatePipelines() { - // Create pipelines + // Create the merged Widget pipeline Renderer::ImageFormat renderTargetFormat = _renderer->GetSwapChainImageFormat(); - { - Renderer::GraphicsPipelineDesc pipelineDesc; - - // Rasterizer state - pipelineDesc.states.rasterizerState.cullMode = Renderer::CullMode::BACK; + Renderer::GraphicsPipelineDesc pipelineDesc; - // Render targets. - pipelineDesc.states.renderTargetFormats[0] = renderTargetFormat; + // Rasterizer state + pipelineDesc.states.rasterizerState.cullMode = Renderer::CullMode::BACK; - // Shader - Renderer::VertexShaderDesc vertexShaderDesc; - vertexShaderDesc.shaderEntry = _gameRenderer->GetShaderEntry("UI/Panel.vs"_h, "UI/Panel.vs"); - pipelineDesc.states.vertexShader = _renderer->LoadShader(vertexShaderDesc); + // Render targets. 
+ pipelineDesc.states.renderTargetFormats[0] = renderTargetFormat; - Renderer::PixelShaderDesc pixelShaderDesc; - pixelShaderDesc.shaderEntry = _gameRenderer->GetShaderEntry("UI/Panel.ps"_h, "UI/Panel.ps"); - pipelineDesc.states.pixelShader = _renderer->LoadShader(pixelShaderDesc); + // Shader + Renderer::VertexShaderDesc vertexShaderDesc; + vertexShaderDesc.shaderEntry = _gameRenderer->GetShaderEntry("UI/Widget.vs"_h, "UI/Widget.vs"); + pipelineDesc.states.vertexShader = _renderer->LoadShader(vertexShaderDesc); - // Blending - pipelineDesc.states.blendState.renderTargets[0].blendEnable = true; - pipelineDesc.states.blendState.renderTargets[0].srcBlend = Renderer::BlendMode::SRC_ALPHA; - pipelineDesc.states.blendState.renderTargets[0].destBlend = Renderer::BlendMode::INV_SRC_ALPHA; - pipelineDesc.states.blendState.renderTargets[0].srcBlendAlpha = Renderer::BlendMode::ONE; - pipelineDesc.states.blendState.renderTargets[0].destBlendAlpha = Renderer::BlendMode::INV_SRC_ALPHA; + Renderer::PixelShaderDesc pixelShaderDesc; + pixelShaderDesc.shaderEntry = _gameRenderer->GetShaderEntry("UI/Widget.ps"_h, "UI/Widget.ps"); + pipelineDesc.states.pixelShader = _renderer->LoadShader(pixelShaderDesc); - _panelPipeline = _renderer->CreatePipeline(pipelineDesc); - } - - { - Renderer::GraphicsPipelineDesc pipelineDesc; - - // Rasterizer state - pipelineDesc.states.rasterizerState.cullMode = Renderer::CullMode::BACK; - - // Render targets. 
- pipelineDesc.states.renderTargetFormats[0] = renderTargetFormat; - - // Shader - Renderer::VertexShaderDesc vertexShaderDesc; - vertexShaderDesc.shaderEntry = _gameRenderer->GetShaderEntry("UI/Text.vs"_h, "UI/Text.vs"); - pipelineDesc.states.vertexShader = _renderer->LoadShader(vertexShaderDesc); + // Blending + pipelineDesc.states.blendState.renderTargets[0].blendEnable = true; + pipelineDesc.states.blendState.renderTargets[0].srcBlend = Renderer::BlendMode::SRC_ALPHA; + pipelineDesc.states.blendState.renderTargets[0].destBlend = Renderer::BlendMode::INV_SRC_ALPHA; + pipelineDesc.states.blendState.renderTargets[0].srcBlendAlpha = Renderer::BlendMode::ONE; + pipelineDesc.states.blendState.renderTargets[0].destBlendAlpha = Renderer::BlendMode::INV_SRC_ALPHA; - Renderer::PixelShaderDesc pixelShaderDesc; - pixelShaderDesc.shaderEntry = _gameRenderer->GetShaderEntry("UI/Text.ps"_h, "UI/Text.ps"); - pipelineDesc.states.pixelShader = _renderer->LoadShader(pixelShaderDesc); - - // Blending - pipelineDesc.states.blendState.renderTargets[0].blendEnable = true; - pipelineDesc.states.blendState.renderTargets[0].srcBlend = Renderer::BlendMode::SRC_ALPHA; - pipelineDesc.states.blendState.renderTargets[0].destBlend = Renderer::BlendMode::INV_SRC_ALPHA; - pipelineDesc.states.blendState.renderTargets[0].srcBlendAlpha = Renderer::BlendMode::ONE; - pipelineDesc.states.blendState.renderTargets[0].destBlendAlpha = Renderer::BlendMode::INV_SRC_ALPHA; - - _textPipeline = _renderer->CreatePipeline(pipelineDesc); - } + _widgetPipeline = _renderer->CreatePipeline(pipelineDesc); } void CanvasRenderer::InitDescriptorSets() { - _panelDescriptorSet.RegisterPipeline(_renderer, _panelPipeline); - _panelDescriptorSet.Init(_renderer); - _textDescriptorSet.RegisterPipeline(_renderer, _textPipeline); - _textDescriptorSet.Init(_renderer); - + _widgetDescriptorSet.RegisterPipeline(_renderer, _widgetPipeline); + _widgetDescriptorSet.Init(_renderer); } void CanvasRenderer::UpdatePanelVertices(const 
vec2& clipPos, const vec2& clipSize, ECS::Components::UI::Panel& panel, ECS::Components::UI::PanelTemplate& panelTemplate) @@ -789,14 +763,16 @@ void CanvasRenderer::UpdatePanelData(entt::entity entity, ECS::Components::Trans // Add draw data if necessary if (panel.gpuDataIndex == -1) { - panel.gpuDataIndex = _panelDrawDatas.Add(); + panel.gpuDataIndex = _widgetDrawDatas.Add(); } vec2 size = transform.GetSize(); - vec2 cornerRadius = vec2(panelTemplate.cornerRadius / size.x, panelTemplate.cornerRadius /size.y); + vec2 cornerRadius = vec2(panelTemplate.cornerRadius / size.x, panelTemplate.cornerRadius / size.y); // Update draw data - auto& drawData = _panelDrawDatas[panel.gpuDataIndex]; - drawData.packed0.z = panelTemplate.color.ToRGBA32(); + auto& drawData = _widgetDrawDatas[panel.gpuDataIndex]; + drawData.packed0.x = static_cast(WidgetDrawType::Panel); + drawData.packed0.y = static_cast(panel.gpuVertexIndex); // vertexBase + drawData.packed1.z = panelTemplate.color.ToRGBA32(); drawData.cornerRadiusAndBorder = vec4(cornerRadius, 0.0f, 0.0f); // Update textures @@ -823,7 +799,7 @@ void CanvasRenderer::UpdatePanelData(entt::entity entity, ECS::Components::Trans additiveTextureIndex = LoadTexture(panelTemplate.foreground); } - drawData.packed0.x = (textureIndex & 0xFFFF) | ((additiveTextureIndex & 0xFFFF) << 16); + drawData.packed1.x = (textureIndex & 0xFFFF) | ((additiveTextureIndex & 0xFFFF) << 16); // Nine slicing const vec2& widgetSize = transform.GetSize(); @@ -833,7 +809,9 @@ void CanvasRenderer::UpdatePanelData(entt::entity entity, ECS::Components::Trans vec2 texSize = vec2(textureBaseDesc.width, textureBaseDesc.height); vec2 textureScaleToWidgetSize = texSize / widgetSize; - drawData.textureScaleToWidgetSize = hvec2(textureScaleToWidgetSize.x, textureScaleToWidgetSize.y); + hvec2 packedScale = hvec2(textureScaleToWidgetSize.x, textureScaleToWidgetSize.y); + static_assert(sizeof(hvec2) == sizeof(u32), "hvec2 must be 4 bytes for packed storage"); + 
+ std::memcpy(&drawData.packed1.w, &packedScale, sizeof(u32)); drawData.texCoord = vec4(panelTemplate.texCoords.min, panelTemplate.texCoords.max); drawData.slicingCoord = vec4(panelTemplate.nineSliceCoords.min, panelTemplate.nineSliceCoords.max); @@ -843,7 +821,7 @@ void CanvasRenderer::UpdatePanelData(entt::entity entity, ECS::Components::Trans // Get the correct clipper auto* clipper = &registry->get(entity); BoundingRect* boundingRect = &registry->get(entity); - + vec2 referenceSize = vec2(Renderer::Settings::UI_REFERENCE_WIDTH, Renderer::Settings::UI_REFERENCE_HEIGHT); vec2 clipRegionMin = clipper->clipRegionMin; vec2 clipRegionMax = clipper->clipRegionMax; @@ -859,16 +837,16 @@ void CanvasRenderer::UpdatePanelData(entt::entity entity, ECS::Components::Trans vec2 scaledClipMaskRegionMin = boundingRect->min / referenceSize; vec2 scaledClipMaskRegionMax = boundingRect->max / referenceSize; - - drawData.packed0.y = (clipper->hasClipMaskTexture) ? LoadTexture(clipper->clipMaskTexture) : 0; - drawData.clipRegionRect = vec4(clipRegionMin, clipRegionMax); - drawData.clipMaskRegionRect = vec4(scaledClipMaskRegionMin, scaledClipMaskRegionMax); - // World position UI + drawData.packed0.z = (clipper->hasClipMaskTexture) ?
LoadTexture(clipper->clipMaskTexture) : 0; + drawData.clipRegionRect = hvec4(clipRegionMin.x, clipRegionMin.y, clipRegionMax.x, clipRegionMax.y); + drawData.clipMaskRegionRect = hvec4(scaledClipMaskRegionMin.x, scaledClipMaskRegionMin.y, scaledClipMaskRegionMax.x, scaledClipMaskRegionMax.y); + + // World position UI (UINT_MAX bit-pattern == -1 when reinterpreted as int in the shader) auto& widget = registry->get(entity); - drawData.worldPositionIndex = widget.worldTransformIndex; + drawData.packed0.w = widget.worldTransformIndex; - _panelDrawDatas.SetDirtyElement(panel.gpuDataIndex); + _widgetDrawDatas.SetDirtyElement(panel.gpuDataIndex); } void CanvasRenderer::UpdateTextData(entt::entity entity, Text& text, ECS::Components::UI::TextTemplate& textTemplate) @@ -903,10 +881,10 @@ void CanvasRenderer::UpdateTextData(entt::entity entity, Text& text, ECS::Compon // Add or update draw data if necessary if (text.gpuDataIndex == -1 || text.hasGrown) { - text.gpuDataIndex = _charDrawDatas.AddCount(text.numCharsNonWhitespace); + text.gpuDataIndex = _widgetDrawDatas.AddCount(text.numCharsNonWhitespace); } - // Update CharDrawData + // Update WidgetDrawData entries (one per non-whitespace char) Renderer::Font* font = Renderer::Font::GetFont(_renderer, textTemplate.font); Renderer::TextureID fontTextureID = font->GetTextureID(); @@ -964,70 +942,32 @@ void CanvasRenderer::UpdateTextData(entt::entity entity, Text& text, ECS::Compon continue; } - auto& drawData = _charDrawDatas[text.gpuDataIndex + charIndex]; - drawData.packed0.x = (fontTextureIndex & 0xFFFF) | ((charIndex & 0xFFFF) << 16); - drawData.packed0.z = textTemplate.color.ToRGBA32(); - drawData.packed0.w = textTemplate.borderColor.ToRGBA32(); - - drawData.packed1.x = textTemplate.borderSize; + auto& drawData = _widgetDrawDatas[text.gpuDataIndex + charIndex]; + drawData.packed0.x = static_cast(WidgetDrawType::Text); + drawData.packed0.y = static_cast(text.gpuVertexIndex) + (charIndex * 6); // vertexBase + 
drawData.packed1.x = (fontTextureIndex & 0xFFFF); + drawData.packed1.z = textTemplate.color.ToRGBA32(); + drawData.packed1.w = textTemplate.borderColor.ToRGBA32(); - // Unit range + // borderSize in cornerRadiusAndBorder.x; unitRange in .zw f32 distanceRange = font->upperPixelRange - font->lowerPixelRange; - drawData.packed1.z = distanceRange / font->width; - drawData.packed1.w = distanceRange / font->height; + drawData.cornerRadiusAndBorder.x = textTemplate.borderSize; + drawData.cornerRadiusAndBorder.y = 0.0f; + drawData.cornerRadiusAndBorder.z = distanceRange / font->width; + drawData.cornerRadiusAndBorder.w = distanceRange / font->height; // Clipping - drawData.packed0.y = (clipper->hasClipMaskTexture) ? LoadTexture(clipper->clipMaskTexture) : 0; - drawData.clipRegionRect = vec4(clipRegionMin, clipRegionMax); - drawData.clipMaskRegionRect = vec4(scaledClipMaskRegionMin, scaledClipMaskRegionMax); + drawData.packed0.z = (clipper->hasClipMaskTexture) ? LoadTexture(clipper->clipMaskTexture) : 0; + drawData.clipRegionRect = hvec4(clipRegionMin.x, clipRegionMin.y, clipRegionMax.x, clipRegionMax.y); + drawData.clipMaskRegionRect = hvec4(scaledClipMaskRegionMin.x, scaledClipMaskRegionMin.y, scaledClipMaskRegionMax.x, scaledClipMaskRegionMax.y); - // World position UI + // World position UI (UINT_MAX bit-pattern == -1 when reinterpreted as int in the shader) auto& widget = registry->get(entity); - drawData.worldPositionIndex = widget.worldTransformIndex; + drawData.packed0.w = widget.worldTransformIndex; charIndex++; } - _charDrawDatas.SetDirtyElements(text.gpuDataIndex, text.numCharsNonWhitespace); -} - -bool CanvasRenderer::ChangePipelineIfNecessary(Renderer::CommandList& commandList, Renderer::GraphicsPipelineID& currentPipeline, ECS::Components::UI::WidgetType widgetType) -{ - if (_lastRenderedWidgetType != widgetType) - { - if (_lastRenderedWidgetType != WidgetType::None) - { - commandList.EndPipeline(currentPipeline); - } - - _lastRenderedWidgetType = widgetType; 
- - if (widgetType == WidgetType::Panel) - { - currentPipeline = _panelPipeline; - } - else - { - currentPipeline = _textPipeline; - } - - commandList.BeginPipeline(currentPipeline); - return true; - } - return false; -} - -void CanvasRenderer::RenderPanel(Renderer::CommandList& commandList, ECS::Components::Transform2D& transform, Widget& widget, Panel& panel) -{ - commandList.PushMarker("Panel", Color::White); - commandList.Draw(6, 1, panel.gpuVertexIndex, panel.gpuDataIndex); - commandList.PopMarker(); -} - -void CanvasRenderer::RenderText(Renderer::CommandList& commandList, ECS::Components::Transform2D& transform, Widget& widget, Text& text) -{ - commandList.PushMarker("Text", Color::White); - commandList.Draw(6, text.numCharsNonWhitespace, text.gpuVertexIndex, text.gpuDataIndex); - commandList.PopMarker(); + _widgetDrawDatas.SetDirtyElements(text.gpuDataIndex, text.numCharsNonWhitespace); } vec2 CanvasRenderer::PixelPosToNDC(const vec2& pixelPosition, const vec2& screenSize) const @@ -1065,7 +1005,7 @@ u32 CanvasRenderer::LoadTexture(std::string_view path) // Use already loaded texture return _textureNameHashToIndex[textureNameHash]; } - + // Load texture Renderer::TextureDesc desc; desc.path = path; @@ -1076,3 +1016,228 @@ u32 CanvasRenderer::LoadTexture(std::string_view path) _textureNameHashToIndex[textureNameHash] = textureIndex; return textureIndex; } + +// ---------------------------------------------------------------------------- +// Sortkey layout (u32): +// MSB LSB +// [ priority 5 | canvasOrder 8 | traversalIndex 15 | reserved 4 ] +// +// - priority: 0 = normal, >0 = promoted (focus/drag/modal...). At the top so a dragged/focused +// widget floats above every normal widget, across canvases. +// - canvasOrder: 0 = bottom canvas, grows upward. Makes per-canvas sort a natural consequence +// of the sort key - no explicit grouping loop needed in the render pass. +// - traversalIndex: DFS pre-order index within a canvas, Z-sorted at each sibling level. 
+// Unique within the canvas. Encodes parent-before-child containment. The index caps at 2^15-1 = 32,767 +// (32,768 widgets per canvas); runaway counts are clamped so they never leak into canvasOrder bits. +// - reserved: for future use (clip bucket, atlas bucket, ...). +// ---------------------------------------------------------------------------- +u8 CanvasRenderer::ResolvePriority(entt::registry* registry, entt::entity entity) const +{ + auto& uiSingleton = registry->ctx().get(); + if (entity != entt::null && entity == uiSingleton.focusedEntity) + { + return 1; // Focus tier. Drag/modal/tooltip slots reserved for future systems. + } + return 0; +} + +void CanvasRenderer::RebuildCanvasOrder(entt::registry* registry) +{ + _canvasOrderByEntity.clear(); + + // Collect canvases + their layer. We iterate via registry->view so the natural + // ordering from entt is used as the tiebreaker when two canvases share a layer. + struct CanvasOrderEntry { entt::entity entity; u32 layer; u32 iterSeenIndex; }; + std::vector canvases; + + u32 iterSeenIndex = 0; + registry->view().each([&](entt::entity canvasEntity, Canvas&) + { + auto& transform = registry->get(canvasEntity); + canvases.push_back({ canvasEntity, transform.GetLayer(), iterSeenIndex++ }); + }); + + // Sort: layer asc, then iteration order asc. Unique keys by construction.
+ std::sort(canvases.begin(), canvases.end(), [](const CanvasOrderEntry& a, const CanvasOrderEntry& b) + { + if (a.layer != b.layer) + return a.layer < b.layer; + return a.iterSeenIndex < b.iterSeenIndex; + }); + + for (size_t i = 0; i < canvases.size(); ++i) + { + _canvasOrderByEntity[canvases[i].entity] = static_cast(std::min(i, 255)); + } +} + +void CanvasRenderer::DfsAssignSortKey(entt::registry* registry, entt::entity entity, u8 canvasOrder, u32& traversalIndex, u8 inheritedPriority) +{ + auto& transform2DSystem = ECS::Transform2DSystem::Get(*registry); + + auto& widget = registry->get(entity); + u8 effectivePriority = std::max(inheritedPriority, ResolvePriority(registry, entity)); + + // Canvases aren't drawn themselves, so we don't produce a sort key for them (they're iteration hubs). + if (widget.type != WidgetType::Canvas) + { + // u32 sortkey layout (MSB -> LSB): + // [ priority 5 | canvasOrder 8 | traversalIndex 15 | reserved 4 ] + // 32,768 widgets per canvas max; runaway scripts get clamped so the key never bleeds + // into the canvasOrder bits. In practice real UIs are well under 1000 widgets per canvas. + constexpr u32 kMaxTraversalIndex = (1u << 15) - 1; + const u32 clampedTraversal = std::min(traversalIndex, kMaxTraversalIndex); + widget.sortKey = (static_cast(effectivePriority) << 27) + | (static_cast(canvasOrder) << 19) + | (clampedTraversal << 4); + ++traversalIndex; + } + + // Gather children, sort by (Transform2D::layer asc, SceneNode2D::siblingIndex asc). Not stable_sort - + // the siblingIndex tiebreaker already guarantees a total order. 
+ // + // Recursion-safe via stack discipline on the shared _siblingScratch: + // - record `start` before pushing this level's children, + // - sort only [start, end), + // - copy each child by VALUE before recursing (the recursive call will push more entries + // and may reallocate the underlying buffer; the by-value copy is unaffected), + // - resize back to `start` before returning so the caller's frame is intact. + const size_t start = _siblingScratch.size(); + transform2DSystem.IterateChildren(entity, [&](entt::entity childEntity) + { + _siblingScratch.push_back(childEntity); + }); + const size_t count = _siblingScratch.size() - start; + + std::sort(_siblingScratch.begin() + start, _siblingScratch.end(), [&](entt::entity a, entt::entity b) + { + const auto& ta = registry->get(a); + const auto& tb = registry->get(b); + if (ta.GetLayer() != tb.GetLayer()) + return ta.GetLayer() < tb.GetLayer(); + + const auto& na = registry->get(a); + const auto& nb = registry->get(b); + return na.GetSiblingIndex() < nb.GetSiblingIndex(); + }); + + for (size_t i = 0; i < count; ++i) + { + // By-value copy is mandatory: the recursive call will push to _siblingScratch and may + // reallocate the backing buffer, invalidating any reference into it. + const entt::entity child = _siblingScratch[start + i]; + DfsAssignSortKey(registry, child, canvasOrder, traversalIndex, effectivePriority); + } + + _siblingScratch.resize(start); +} + +void CanvasRenderer::RefreshBucketCPU(entt::registry* registry, entt::entity canvasEntity, bool isRT) +{ + ECS::Transform2DSystem& transformSystem2D = ECS::Transform2DSystem::Get(*registry); + + // Resolve target bucket (insert empty on first encounter for this RT canvas). + BucketResources* bucket = isRT ? 
&_rtBuckets.try_emplace(canvasEntity).first->second + : &_mainBucket; + + // --- Gather (sortKey, IndirectDraw) pairs into _sortScratch ------------------------------- + _sortScratch.clear(); + + auto gather = [&](entt::entity root) + { + transformSystem2D.IterateChildrenRecursiveDepth(root, [&](entt::entity childEntity) + { + auto& w = registry->get(childEntity); + if (!w.IsVisible()) + return false; + if (w.type != WidgetType::Panel && w.type != WidgetType::Text) + return true; + + Renderer::IndirectDraw args{}; + args.vertexCount = 6; + args.firstVertex = 0; + + if (w.type == WidgetType::Panel) + { + auto& panel = registry->get(childEntity); + if (panel.gpuDataIndex < 0) + return true; + args.instanceCount = 1; + args.firstInstance = static_cast(panel.gpuDataIndex); + } + else // Text + { + auto& text = registry->get(childEntity); + if (text.numCharsNonWhitespace <= 0 || text.gpuDataIndex < 0) + return true; + args.instanceCount = static_cast(text.numCharsNonWhitespace); + args.firstInstance = static_cast(text.gpuDataIndex); + } + + _sortScratch.push_back({ w.sortKey, args }); + return true; + }); + }; + + if (isRT) + { + gather(canvasEntity); + } + else + { + registry->view(entt::exclude).each([&](entt::entity c, Canvas&) + { + gather(c); + }); + } + + const u32 drawCount = static_cast(_sortScratch.size()); + bucket->drawCount = drawCount; + + if (drawCount == 0) + { + // Nothing to draw. Leave retained buffers as-is; the draw pass checks drawCount==0 and skips. + return; + } + + // --- Sort on CPU --------------------------------------------------------------------------- + // std::sort beats our GPU radix sort at UI scale (N up to a few thousand) because the GPU pipe + // is dispatch-overhead-bound regardless of N. See git history for the GPU path (RadixSort.*). 
+ std::sort(_sortScratch.begin(), _sortScratch.end(), [](const SortEntry& a, const SortEntry& b) + { + return a.key < b.key; + }); + + // --- Extract sorted IndirectDraws into contiguous upload vector ---------------------------- + _uploadScratch.clear(); + _uploadScratch.reserve(drawCount); + for (const SortEntry& e : _sortScratch) + _uploadScratch.push_back(e.draw); + + // --- (Re)create retained finalSortedArgs / finalCount if needed ---------------------------- + if (bucket->finalSortedArgsCapacity < drawCount || bucket->finalSortedArgs == Renderer::BufferID::Invalid()) + { + Renderer::BufferDesc argsDesc; + argsDesc.name = isRT ? "UISort.RT.FinalSortedArgs" : "UISort.Main.FinalSortedArgs"; + argsDesc.usage = Renderer::BufferUsage::INDIRECT_ARGUMENT_BUFFER + | Renderer::BufferUsage::TRANSFER_DESTINATION; + argsDesc.size = static_cast(drawCount) * sizeof(Renderer::IndirectDraw); + bucket->finalSortedArgs = _renderer->CreateBuffer(bucket->finalSortedArgs, argsDesc); + bucket->finalSortedArgsCapacity = drawCount; + } + if (bucket->finalCount == Renderer::BufferID::Invalid()) + { + Renderer::BufferDesc countDesc; + countDesc.name = isRT ? "UISort.RT.FinalCount" : "UISort.Main.FinalCount"; + countDesc.usage = Renderer::BufferUsage::INDIRECT_ARGUMENT_BUFFER + | Renderer::BufferUsage::TRANSFER_DESTINATION; + countDesc.size = sizeof(u32); + bucket->finalCount = _renderer->CreateBuffer(countDesc); + } + + // --- Upload -------------------------------------------------------------------------------- + // UploadToBuffer queues a staged copy that completes before the next frame's command list + // runs. Same mechanism we already use for everything else here. 
+ _renderer->UploadToBuffer(bucket->finalSortedArgs, 0, _uploadScratch.data(), 0, static_cast(drawCount) * sizeof(Renderer::IndirectDraw)); + _renderer->UploadToBuffer(bucket->finalCount, 0, &bucket->drawCount, 0, sizeof(u32)); +} diff --git a/Source/Game-Lib/Game-Lib/Rendering/Canvas/CanvasRenderer.h b/Source/Game-Lib/Game-Lib/Rendering/Canvas/CanvasRenderer.h index ea0b5bc4..0dd718aa 100644 --- a/Source/Game-Lib/Game-Lib/Rendering/Canvas/CanvasRenderer.h +++ b/Source/Game-Lib/Game-Lib/Rendering/Canvas/CanvasRenderer.h @@ -8,9 +8,14 @@ #include #include #include +#include + +#include #include +#include + namespace Renderer { class RenderGraph; @@ -65,42 +70,51 @@ class CanvasRenderer void UpdatePanelData(entt::entity entity, ECS::Components::Transform2D& transform, ECS::Components::UI::Panel& panel, ECS::Components::UI::PanelTemplate& panelTemplate); void UpdateTextData(entt::entity entity, ECS::Components::UI::Text& text, ECS::Components::UI::TextTemplate& textTemplate); - bool ChangePipelineIfNecessary(Renderer::CommandList& commandList, Renderer::GraphicsPipelineID& currentPipeline, ECS::Components::UI::WidgetType widgetType); - void RenderPanel(Renderer::CommandList& commandList, ECS::Components::Transform2D& transform, ECS::Components::UI::Widget& widget, ECS::Components::UI::Panel& panel); - void RenderText(Renderer::CommandList& commandList, ECS::Components::Transform2D& transform, ECS::Components::UI::Widget& widget, ECS::Components::UI::Text& text); - vec2 PixelPosToNDC(const vec2& pixelPosition, const vec2& screenSize) const; vec2 PixelSizeToNDC(const vec2& pixelPosition, const vec2& screenSize) const; u32 AddTexture(Renderer::TextureID textureID); u32 LoadTexture(std::string_view path); - -private: - struct PanelDrawData + // --- Sortkey machinery (see CanvasRenderer.cpp for bit layout) ----------------- + // Resolves the effective priority for this widget (0 = normal, >0 = promoted for focus/drag/etc). 
+ u8 ResolvePriority(entt::registry* registry, entt::entity entity) const; + + // Rebuilds the mapping from canvas entity to its 8-bit canvasOrder, based on canvas layer + registry iteration. + void RebuildCanvasOrder(entt::registry* registry); + + // Walks a canvas's subtree depth-first, writing sortKey to each Widget component. Siblings are sorted by + // (Transform2D::layer asc, SceneNode2D::siblingIndex asc) before recursion so the order is deterministic + // and every produced sortKey is unique. + void DfsAssignSortKey(entt::registry* registry, entt::entity entity, u8 canvasOrder, u32& traversalIndex, u8 inheritedPriority); + + // Gather + sort + upload for one render-pass bucket. Walks the canvas subtree(s), filters + // visible Panel/Text entries into _sortScratch, std::sorts by sortKey, copies the sorted + // IndirectDraws into _uploadScratch, and queues a CPU->GPU upload to the bucket's retained + // finalSortedArgs + finalCount. + // + // isRT == false selects the "main bucket" (every non-RT canvas merged); canvasEntity is then ignored (entt::null is fine).
+ void RefreshBucketCPU(entt::registry* registry, entt::entity canvasEntity, bool isRT); + + +public: + enum class WidgetDrawType : u32 { - public: - uvec3 packed0; // x: textureIndex & additiveTextureIndex, y: clipMaskTextureIndex, z: color - hvec2 textureScaleToWidgetSize = hvec2(0.0f, 0.0f); - vec4 texCoord; // uv - vec4 slicingCoord; // uv - //vec4 color; // xyz: color, w: unused - vec4 cornerRadiusAndBorder; // xy: cornerRadius, zw: border - hvec4 clipRegionRect = hvec4(0.0f, 0.0f, 1.0f, 1.0f); // xy: min, zw: max - hvec4 clipMaskRegionRect = hvec4(0.0f, 0.0f, 1.0f, 1.0f); // xy: min, zw: max - i32 worldPositionIndex = -1; - i32 padding[3]; + Panel = 0, + Text = 1, }; - struct CharDrawData +private: + struct WidgetDrawData { public: - uvec4 packed0; // x: textureIndex & clipMaskTextureIndex, y: charIndex, z: textColor, w: borderColor - vec4 packed1; // x: borderSize, y: padding, zw: unitRangeXY - hvec4 clipRegionRect = hvec4(0.0f, 0.0f, 1.0f, 1.0f); // xy: min, zw: max + uvec4 packed0 = uvec4(0, 0, 0, 0xFFFFFFFFu); // x: type, y: vertexBase, z: clipMaskTextureIndex, w: worldPositionIndex (i32 reinterpret as -1) + uvec4 packed1 = uvec4(0, 0, 0, 0); // Panel: x: textureIndex|additiveTextureIndex, z: color, w: textureScaleToWidgetSize (half2). Text: x: fontTextureIndex, z: textColor, w: borderColor + vec4 texCoord = vec4(0.0f); // Panel only + vec4 slicingCoord = vec4(0.0f); // Panel only + vec4 cornerRadiusAndBorder = vec4(0.0f); // Panel: xy: cornerRadius. 
Text: x: borderSize, zw: unitRange + hvec4 clipRegionRect = hvec4(0.0f, 0.0f, 1.0f, 1.0f); // xy: min, zw: max hvec4 clipMaskRegionRect = hvec4(0.0f, 0.0f, 1.0f, 1.0f); // xy: min, zw: max - i32 worldPositionIndex; - i32 padding[3]; }; private: @@ -109,12 +123,10 @@ class CanvasRenderer DebugRenderer* _debugRenderer; Renderer::GPUVector _vertices; - Renderer::GPUVector _panelDrawDatas; - - Renderer::GPUVector _charDrawDatas; + Renderer::GPUVector _widgetDrawDatas; Renderer::GPUVector _widgetWorldPositions; - + Renderer::Font* _font; Renderer::SamplerID _sampler; Renderer::TextureArrayID _textures; @@ -124,11 +136,42 @@ class CanvasRenderer Renderer::TextureArrayID _fontTextures; robin_hood::unordered_map _textureIDToFontTexturesIndex; - Renderer::GraphicsPipelineID _panelPipeline; - Renderer::GraphicsPipelineID _textPipeline; + Renderer::GraphicsPipelineID _widgetPipeline; + + Renderer::DescriptorSet _widgetDescriptorSet; + + // --- Sortkey state ------------------------------------------------------------ + // Assigned canvasOrder (0..255) per canvas entity, refreshed by RebuildCanvasOrder when + // the canvas SET changes (gated on DirtyCanvasOrderFlag). Read by DfsAssignSortKey to bake + // canvasOrder into each widget's sortKey. + robin_hood::unordered_map _canvasOrderByEntity; + + // Shared scratch for DfsAssignSortKey. Each recursion level appends its children's entities + // to the tail, sorts only its own [start, end) range, recurses by-value, and resizes back + // to its start before returning. Net: zero allocations after the first warmup. + std::vector _siblingScratch; + + // --- Per-bucket retained indirect-draw state ---------------------------------- + // One BucketResources per render-pass bucket: one per RT canvas that has ever existed, + // plus one static _mainBucket for all non-RT canvases merged together. 
finalSortedArgs + // is retained across frames; it's CPU-sorted and uploaded only when the bucket is dirty, + // and consumed as-is by DrawIndirectCount every frame. + struct BucketResources + { + Renderer::BufferID finalSortedArgs = Renderer::BufferID::Invalid(); + u32 finalSortedArgsCapacity = 0; + u32 drawCount = 0; + + // Single-element u32 count buffer for DrawIndirectCount. + Renderer::BufferID finalCount = Renderer::BufferID::Invalid(); + }; - Renderer::DescriptorSet _panelDescriptorSet; - Renderer::DescriptorSet _textDescriptorSet; + robin_hood::unordered_map _rtBuckets; // key: RT canvas entity + BucketResources _mainBucket; - ECS::Components::UI::WidgetType _lastRenderedWidgetType = ECS::Components::UI::WidgetType::None; + // CPU scratch for gather+sort+upload inside RefreshBucketCPU. Reused across refreshes; + // `.clear()` preserves capacity. + struct SortEntry { u32 key; Renderer::IndirectDraw draw; }; + std::vector _sortScratch; + std::vector _uploadScratch; }; \ No newline at end of file diff --git a/Source/Game-Lib/Game-Lib/Rendering/Util/RadixSort.cpp b/Source/Game-Lib/Game-Lib/Rendering/Util/RadixSort.cpp new file mode 100644 index 00000000..9afe8903 --- /dev/null +++ b/Source/Game-Lib/Game-Lib/Rendering/Util/RadixSort.cpp @@ -0,0 +1,294 @@ +#include "RadixSort.h" + +#include "Game-Lib/Rendering/GameRenderer.h" + +#include +#include +#include + +#include + +// Mirror the shader-side constants in Source/Shaders/Shaders/Sorting/Radix/Constants.inc.slang. 
+static constexpr u32 RADIX = 256; +static constexpr u32 WORKGROUP_SIZE = 512; +static constexpr u32 PARTITION_DIVISION = 8; +static constexpr u32 PARTITION_SIZE = PARTITION_DIVISION * WORKGROUP_SIZE; // 4096 +static constexpr u32 NUM_RADIX_PASSES = 4; // 4 passes * 8 bits = 32-bit key + +RadixSort::RadixSort() + : _upsweepFromPingSet(Renderer::DescriptorSetSlot::PER_PASS) + , _upsweepFromPongSet(Renderer::DescriptorSetSlot::PER_PASS) + , _spineSet(Renderer::DescriptorSetSlot::PER_PASS) + , _downsweepPingToPongSet(Renderer::DescriptorSetSlot::PER_PASS) + , _downsweepPongToPingSet(Renderer::DescriptorSetSlot::PER_PASS) +{ +} + +void RadixSort::Init(Renderer::Renderer* renderer, GameRenderer* gameRenderer, u32 maxKeyCount) +{ + _renderer = renderer; + _gameRenderer = gameRenderer; + + CreatePipelines(); + AllocateFixedScratch(); + + _upsweepFromPingSet.RegisterPipeline(_renderer, _upsweepPipeline); + _upsweepFromPingSet.Init(_renderer); + _upsweepFromPongSet.RegisterPipeline(_renderer, _upsweepPipeline); + _upsweepFromPongSet.Init(_renderer); + + _spineSet.RegisterPipeline(_renderer, _spinePipeline); + _spineSet.Init(_renderer); + + _downsweepPingToPongSet.RegisterPipeline(_renderer, _downsweepPipeline); + _downsweepPingToPongSet.Init(_renderer); + _downsweepPongToPingSet.RegisterPipeline(_renderer, _downsweepPipeline); + _downsweepPongToPingSet.Init(_renderer); + + AllocateKeyCountScratch(maxKeyCount); + BindAllDescriptorSets(); + + _initialized = true; +} + +void RadixSort::CreatePipelines() +{ + // Upsweep + Spine: single permutation. 
+ auto loadNoPermutation = [&](const char* shaderPath, Renderer::ComputePipelineID& out, const char* debugName) + { + Renderer::ComputeShaderDesc shaderDesc; + shaderDesc.shaderEntry = _gameRenderer->GetShaderEntry( + Renderer::GetShaderEntryNameHash(shaderPath, {}), + shaderPath); + + Renderer::ComputePipelineDesc pipelineDesc; + pipelineDesc.debugName = debugName; + pipelineDesc.computeShader = _renderer->LoadShader(shaderDesc); + out = _renderer->CreatePipeline(pipelineDesc); + }; + + loadNoPermutation("Sorting/Radix/Upsweep.cs", _upsweepPipeline, "RadixSort.Upsweep"); + loadNoPermutation("Sorting/Radix/Spine.cs", _spinePipeline, "RadixSort.Spine"); + + // Downsweep: compile with KEY_VALUE=1 permutation (we always sort key+value pairs). + { + std::vector permutation = { + { "KEY_VALUE", "1" } + }; + const char* shaderPath = "Sorting/Radix/Downsweep.cs"; + + Renderer::ComputeShaderDesc shaderDesc; + shaderDesc.shaderEntry = _gameRenderer->GetShaderEntry( + Renderer::GetShaderEntryNameHash(shaderPath, permutation), + shaderPath); + + Renderer::ComputePipelineDesc pipelineDesc; + pipelineDesc.debugName = "RadixSort.Downsweep.KV"; + pipelineDesc.computeShader = _renderer->LoadShader(shaderDesc); + _downsweepPipeline = _renderer->CreatePipeline(pipelineDesc); + } +} + +void RadixSort::AllocateFixedScratch() +{ + // globalHistogram: one u32[256] per radix pass = 4 * 256 * 4 bytes = 4 KiB. Zeroed by + // FillBuffer at the start of each sort (globalHistogram is accumulated then scanned per pass). 
+ Renderer::BufferDesc desc; + desc.name = "RadixSort.GlobalHistogram"; + desc.usage = Renderer::BufferUsage::STORAGE_BUFFER | Renderer::BufferUsage::TRANSFER_DESTINATION; + desc.size = NUM_RADIX_PASSES * RADIX * sizeof(u32); + _globalHistogram = _renderer->CreateBuffer(desc); +} + +void RadixSort::AllocateKeyCountScratch(u32 newMaxKeyCount) +{ + const u8 sortScratchUsage = Renderer::BufferUsage::STORAGE_BUFFER + | Renderer::BufferUsage::TRANSFER_DESTINATION; + const u8 identityUsage = Renderer::BufferUsage::STORAGE_BUFFER + | Renderer::BufferUsage::TRANSFER_SOURCE; + + Renderer::BufferDesc desc; + desc.usage = sortScratchUsage; + desc.size = static_cast(newMaxKeyCount) * sizeof(u32); + + desc.name = "RadixSort.SortKeys"; + _sortKeys = _renderer->CreateBuffer(_sortKeys, desc); + + desc.name = "RadixSort.WriteKeys"; + _writeKeys = _renderer->CreateBuffer(_writeKeys, desc); + + desc.name = "RadixSort.SortValues"; + _sortValues = _renderer->CreateBuffer(_sortValues, desc); + + desc.name = "RadixSort.WriteValues"; + _writeValues = _renderer->CreateBuffer(_writeValues, desc); + + // partitionHistogram: u32[maxPartitions * 256]. Written by upsweep, scanned by spine, read by + // downsweep. No TRANSFER usage needed (never copied to/from). + const u32 maxPartitions = (newMaxKeyCount + PARTITION_SIZE - 1) / PARTITION_SIZE; + desc.name = "RadixSort.PartitionHistogram"; + desc.usage = Renderer::BufferUsage::STORAGE_BUFFER; + desc.size = static_cast(std::max(maxPartitions, 1u)) * RADIX * sizeof(u32); + _partitionHistogram = _renderer->CreateBuffer(_partitionHistogram, desc); + + // Identity values buffer: [0, 1, 2, ..., newMaxKeyCount-1]. CopyBuffer source only. 
+ desc.name = "RadixSort.IdentityValues"; + desc.usage = identityUsage; + desc.size = static_cast(newMaxKeyCount) * sizeof(u32); + _identityValues = _renderer->CreateAndFillBuffer(_identityValues, desc, + [newMaxKeyCount](void* mapped, size_t) { + u32* p = static_cast(mapped); + for (u32 i = 0; i < newMaxKeyCount; ++i) + p[i] = i; + }); + + _maxKeyCount = newMaxKeyCount; +} + +void RadixSort::BindAllDescriptorSets() +{ + // Binding numbers match the shader's [[vk::binding(N, PER_PASS)]]. Binding 0 is not used -- + // we dropped the elementCounts buffer in favour of a push constant. + _upsweepFromPingSet.Bind("globalHistogram"_h, _globalHistogram); + _upsweepFromPingSet.Bind("partitionHistogram"_h, _partitionHistogram); + _upsweepFromPingSet.Bind("keys"_h, _sortKeys); + + _upsweepFromPongSet.Bind("globalHistogram"_h, _globalHistogram); + _upsweepFromPongSet.Bind("partitionHistogram"_h, _partitionHistogram); + _upsweepFromPongSet.Bind("keys"_h, _writeKeys); + + _spineSet.Bind("globalHistogram"_h, _globalHistogram); + _spineSet.Bind("partitionHistogram"_h, _partitionHistogram); + + // Downsweep ping->pong: read sortKeys/sortValues, write writeKeys/writeValues. + _downsweepPingToPongSet.Bind("globalHistogram"_h, _globalHistogram); + _downsweepPingToPongSet.Bind("partitionHistogram"_h, _partitionHistogram); + _downsweepPingToPongSet.Bind("keysIn"_h, _sortKeys); + _downsweepPingToPongSet.Bind("keysOut"_h, _writeKeys); + _downsweepPingToPongSet.Bind("valuesIn"_h, _sortValues); + _downsweepPingToPongSet.Bind("valuesOut"_h, _writeValues); + + // Downsweep pong->ping: the reverse. 
+ _downsweepPongToPingSet.Bind("globalHistogram"_h, _globalHistogram); + _downsweepPongToPingSet.Bind("partitionHistogram"_h, _partitionHistogram); + _downsweepPongToPingSet.Bind("keysIn"_h, _writeKeys); + _downsweepPongToPingSet.Bind("keysOut"_h, _sortKeys); + _downsweepPongToPingSet.Bind("valuesIn"_h, _writeValues); + _downsweepPongToPingSet.Bind("valuesOut"_h, _sortValues); +} + +void RadixSort::EnsureCapacity(u32 requiredMaxKeyCount) +{ + if (!_initialized || requiredMaxKeyCount <= _maxKeyCount) + return; + + const u32 newCap = std::max(_maxKeyCount * 2, requiredMaxKeyCount); + + AllocateKeyCountScratch(newCap); + + // Rebind every descriptor set that references the resized buffers. Safe here because the + // caller must invoke EnsureCapacity CPU-side (outside any render-graph execute) -- the + // previous frame's command list either isn't submitted yet or has already released the old + // IDs (deferred-destroyed via QueueDestroyBuffer inside CreateBuffer(existing, desc)). + _upsweepFromPingSet.Bind("partitionHistogram"_h, _partitionHistogram); + _upsweepFromPingSet.Bind("keys"_h, _sortKeys); + + _upsweepFromPongSet.Bind("partitionHistogram"_h, _partitionHistogram); + _upsweepFromPongSet.Bind("keys"_h, _writeKeys); + + _spineSet.Bind("partitionHistogram"_h, _partitionHistogram); + + _downsweepPingToPongSet.Bind("partitionHistogram"_h, _partitionHistogram); + _downsweepPingToPongSet.Bind("keysIn"_h, _sortKeys); + _downsweepPingToPongSet.Bind("keysOut"_h, _writeKeys); + _downsweepPingToPongSet.Bind("valuesIn"_h, _sortValues); + _downsweepPingToPongSet.Bind("valuesOut"_h, _writeValues); + + _downsweepPongToPingSet.Bind("partitionHistogram"_h, _partitionHistogram); + _downsweepPongToPingSet.Bind("keysIn"_h, _writeKeys); + _downsweepPongToPingSet.Bind("keysOut"_h, _sortKeys); + _downsweepPongToPingSet.Bind("valuesIn"_h, _writeValues); + _downsweepPongToPingSet.Bind("valuesOut"_h, _sortValues); +} + +RadixSort::PassResources 
RadixSort::RegisterPass(Renderer::RenderGraphBuilder& builder) +{ + using BufferUsage = Renderer::BufferPassUsage; + + PassResources res; + + // sortKeys/sortValues are CopyBuffer destinations THEN compute read/write within the sort, + // so they need TRANSFER | COMPUTE. + res.sortKeys = builder.Write(_sortKeys, BufferUsage::TRANSFER | BufferUsage::COMPUTE); + res.sortValues = builder.Write(_sortValues, BufferUsage::TRANSFER | BufferUsage::COMPUTE); + builder.Write(_writeKeys, BufferUsage::COMPUTE); + builder.Write(_writeValues, BufferUsage::COMPUTE); + res.identityValues = builder.Read(_identityValues, BufferUsage::TRANSFER); + + // globalHistogram is zeroed via FillBuffer (TRANSFER) then read/written by compute. + res.globalHistogram = builder.Write(_globalHistogram, BufferUsage::TRANSFER | BufferUsage::COMPUTE); + builder.Write(_partitionHistogram, BufferUsage::COMPUTE); + + res.upsweepFromPing = builder.Use(_upsweepFromPingSet); + res.upsweepFromPong = builder.Use(_upsweepFromPongSet); + res.spine = builder.Use(_spineSet); + res.downsweepPingToPong = builder.Use(_downsweepPingToPongSet); + res.downsweepPongToPing = builder.Use(_downsweepPongToPingSet); + + return res; +} + +void RadixSort::RecordSort(Renderer::CommandList& commandList, u8 frameIndex, + const PassResources& passRes, u32 numKeys) +{ + if (numKeys == 0) + return; + + const u32 partitionCount = (numKeys + PARTITION_SIZE - 1) / PARTITION_SIZE; + + struct RadixPC { u32 pass; u32 elementCount; }; + + commandList.PushMarker("RadixSort", Color::Green); + + // Zero the 4 KiB global histogram before each sort. Spine writes a prefix-sum over the per- + // pass counts accumulated by upsweep into this buffer; we need a clean slate. 
+ commandList.FillBuffer(passRes.globalHistogram, 0, NUM_RADIX_PASSES * RADIX * sizeof(u32), 0); + commandList.BufferBarrier(passRes.globalHistogram, Renderer::BufferPassUsage::TRANSFER); + + for (u32 pass = 0; pass < NUM_RADIX_PASSES; ++pass) + { + const bool fromPing = (pass & 1) == 0; + RadixPC pc{ pass, numKeys }; + + // Upsweep: builds per-partition histograms + global histogram for this pass. + commandList.BeginPipeline(_upsweepPipeline); + commandList.PushConstant(&pc, 0, sizeof(pc)); + commandList.BindDescriptorSet(fromPing ? passRes.upsweepFromPing : passRes.upsweepFromPong, frameIndex); + commandList.Dispatch(partitionCount, 1, 1); + commandList.EndPipeline(_upsweepPipeline); + + commandList.BufferBarrier(passRes.sortKeys, Renderer::BufferPassUsage::COMPUTE); // NOTE(review): upsweep's outputs are the histograms, yet this barrier names sortKeys -- confirm BufferBarrier also orders the histogram writes (same question for the barrier after spine) + + // Spine: prefix-scan the per-partition histograms (one group per radix bin) + prefix-scan + // the global histogram for this pass (bin 0's group handles that). + commandList.BeginPipeline(_spinePipeline); + commandList.PushConstant(&pc, 0, sizeof(pc)); + commandList.BindDescriptorSet(passRes.spine, frameIndex); + commandList.Dispatch(RADIX, 1, 1); + commandList.EndPipeline(_spinePipeline); + + commandList.BufferBarrier(passRes.sortKeys, Renderer::BufferPassUsage::COMPUTE); + + // Downsweep: scatter keys and values to their globally sorted positions for this pass. + commandList.BeginPipeline(_downsweepPipeline); + commandList.PushConstant(&pc, 0, sizeof(pc)); + commandList.BindDescriptorSet(fromPing ?
passRes.downsweepPingToPong : passRes.downsweepPongToPing, frameIndex); + commandList.Dispatch(partitionCount, 1, 1); + commandList.EndPipeline(_downsweepPipeline); + + if (pass + 1 < NUM_RADIX_PASSES) + commandList.BufferBarrier(passRes.sortKeys, Renderer::BufferPassUsage::COMPUTE); + } + + commandList.PopMarker(); +} diff --git a/Source/Game-Lib/Game-Lib/Rendering/Util/RadixSort.h b/Source/Game-Lib/Game-Lib/Rendering/Util/RadixSort.h new file mode 100644 index 00000000..7661771e --- /dev/null +++ b/Source/Game-Lib/Game-Lib/Rendering/Util/RadixSort.h @@ -0,0 +1,113 @@ +#pragma once +#include + +#include +#include +#include +#include +#include + +namespace Renderer +{ + class Renderer; + class RenderGraphBuilder; + class CommandList; +} + +class GameRenderer; + +// GPU u32 radix sort (reduce-then-scan), 8 bits per pass => 4 passes per u32 key, 3 dispatches per +// pass => 12 dispatches per sort. Port of https://github.com/jaesung-cs/vulkan_radix_sort. +// +// Architecture: a single descriptor-set family shared across every sort in the frame. Inputs are +// staged into `sortKeys`/`sortValues` via vkCmdCopyBuffer before each sort. All descriptor sets +// bind fixed scratch buffers, are bound once at Init, and are rebound only by EnsureCapacity -- keeps SSBO +// descriptor-pool cost flat regardless of how many sorts run per frame. +// +// Usage: +// 1. Call `RegisterPass(builder)` once inside a render-graph pass's onSetup lambda. +// 2. For each sort inside onExecute: +// a. CopyBuffer caller's keys -> passRes.sortKeys. +// b. CopyBuffer identityValues -> passRes.sortValues (seed payload = [0..N-1]). +// c. Barriers from TRANSFER to COMPUTE on sortKeys / sortValues. +// d. Call `RecordSort(cl, frameIndex, passRes, numKeys)`. +// After RecordSort returns, passRes.sortKeys/sortValues contain the sorted pairs in place +// (4 passes = even count, so the ping-pong lands back on the input buffers).
+// +// Growth: `EnsureCapacity(N)` reallocates size-dependent scratch to at least N keys and rebinds +// the 5 sort descriptor sets. Must be called CPU-side (outside render-graph execution). +class RadixSort +{ +public: + struct PassResources + { + Renderer::DescriptorSetResource upsweepFromPing; + Renderer::DescriptorSetResource upsweepFromPong; + Renderer::DescriptorSetResource spine; + Renderer::DescriptorSetResource downsweepPingToPong; + Renderer::DescriptorSetResource downsweepPongToPing; + + // Shared-scratch handles the caller copies INTO before calling RecordSort. + Renderer::BufferMutableResource sortKeys; // u32[maxN] + Renderer::BufferMutableResource sortValues; // u32[maxN] + + // Source the caller copies FROM when seeding sortValues = [0..N-1]. + Renderer::BufferResource identityValues; // u32[maxN] + + // Internal scratch exposed only so RecordSort can FillBuffer / BufferBarrier it. + // Callers shouldn't touch this. + Renderer::BufferMutableResource globalHistogram; + }; + + RadixSort(); + + void Init(Renderer::Renderer* renderer, GameRenderer* gameRenderer, u32 maxKeyCount); + + PassResources RegisterPass(Renderer::RenderGraphBuilder& builder); + + void RecordSort(Renderer::CommandList& commandList, u8 frameIndex, + const PassResources& passRes, u32 numKeys); + + void EnsureCapacity(u32 requiredMaxKeyCount); + + Renderer::BufferID GetSortValuesBuffer() const { return _sortValues; } + u32 GetMaxKeyCount() const { return _maxKeyCount; } + +private: + void CreatePipelines(); + void AllocateFixedScratch(); // globalHistogram (fixed, 4 KiB) + void AllocateKeyCountScratch(u32 newMaxKeyCount); // sortKeys/sortValues/writeKeys/writeValues/identityValues/partitionHistogram + void BindAllDescriptorSets(); + +private: + Renderer::Renderer* _renderer = nullptr; + GameRenderer* _gameRenderer = nullptr; + + u32 _maxKeyCount = 0; + + // --- Pipelines (3) -------------------------------------------------------------------- + Renderer::ComputePipelineID 
_upsweepPipeline; + Renderer::ComputePipelineID _spinePipeline; + Renderer::ComputePipelineID _downsweepPipeline; // compiled with KEY_VALUE=1 permutation + + // --- 5 shared descriptor sets (bound once, rebound only by EnsureCapacity) ------------ + Renderer::DescriptorSet _upsweepFromPingSet; + Renderer::DescriptorSet _upsweepFromPongSet; + Renderer::DescriptorSet _spineSet; + Renderer::DescriptorSet _downsweepPingToPongSet; + Renderer::DescriptorSet _downsweepPongToPingSet; + + // --- Scratch buffers ------------------------------------------------------------------ + // Size-dependent on _maxKeyCount (resized by EnsureCapacity): + Renderer::BufferID _sortKeys = Renderer::BufferID::Invalid(); // u32[maxN] + Renderer::BufferID _sortValues = Renderer::BufferID::Invalid(); // u32[maxN] + Renderer::BufferID _writeKeys = Renderer::BufferID::Invalid(); // u32[maxN] ping-pong + Renderer::BufferID _writeValues = Renderer::BufferID::Invalid(); // u32[maxN] ping-pong + Renderer::BufferID _identityValues = Renderer::BufferID::Invalid(); // u32[maxN] = [0..maxN-1] + Renderer::BufferID _partitionHistogram = Renderer::BufferID::Invalid(); // u32[maxPartitions * 256] + + // Fixed-size scratch (never resized): + Renderer::BufferID _globalHistogram = Renderer::BufferID::Invalid(); // u32[4 * 256] = 4 KiB + + bool _initialized = false; +}; diff --git a/Source/Resources/Scripts/API/Input/Input.luau b/Source/Resources/Scripts/API/Input/Input.luau index c2cf01a8..f47052a6 100644 --- a/Source/Resources/Scripts/API/Input/Input.luau +++ b/Source/Resources/Scripts/API/Input/Input.luau @@ -80,7 +80,7 @@ inputTableAPI.keyNames = [284] = "PAUSE", [290] = "F1", [291] = "F2", - [292] = "F2", + [292] = "F3", [293] = "F4", [294] = "F5", [295] = "F6", diff --git a/Source/Resources/Scripts/UI/Demo.luau b/Source/Resources/Scripts/UI/Demo.luau index d2d18fea..8b625061 100644 --- a/Source/Resources/Scripts/UI/Demo.luau +++ b/Source/Resources/Scripts/UI/Demo.luau @@ -63,16 +63,16 @@ local function CreateMultiLineDemo(rtCanvas :
Canvas) -- Create a panel local panel = canvas:NewPanel(0, 0, 400, 400, 0, "DialogBox"); - --panel:SetAnchor(0.5, 0.5); - --panel:SetRelativePoint(0.5, 0.5); + panel:SetAnchor(0.5, 0.5); + panel:SetRelativePoint(0.5, 0.5); - panel:SetAnchor(0.0, 0.0); - panel:SetRelativePoint(1.0, 0.0); + --panel:SetAnchor(0.0, 0.0); + --panel:SetRelativePoint(1.0, 0.0); --panel:DebugSetWorldTransformIndex(0); local middlePanel = panel:NewPanel(0, 0, 50, 50, 0, "DialogBox"); - middlePanel:SetAnchor(0.0, 0.0); - middlePanel:SetRelativePoint(0.0, 0.0); + middlePanel:SetAnchor(0.5, 0.5); + middlePanel:SetRelativePoint(0.5, 0.5); local bottomLeft = panel:NewText("Bottom Left\nLeft Bottom", 0, 0, 0, "DefaultButtonText"); bottomLeft:SetAnchor(0.0, 0.0); @@ -110,6 +110,408 @@ local function Demo() CreateMultiLineDemo(rtCanvas) end +-- ===================================================================== +-- SortingDemo +-- +-- Visual regression case for the upcoming GPU-driven UI renderer (merged +-- Widget pipeline + GPU sort + indirect draw). Run this BEFORE the +-- refactor and screenshot it as a reference, then run it AFTER and +-- compare. Every test must look pixel-identical, modulo deliberate new +-- behaviour: tests 7 and 8 start exercising the layer parameter once +-- GPU sort honors it. +-- +-- Sibling order: siblings are drawn in INSERTION ORDER (first inserted +-- on the bottom, last inserted on top). Several tests use 4+ siblings +-- to verify this directly. +-- ===================================================================== +local function CreateSortingDemo() + -- Local debug colors so we don't pollute Templates.luau. Re-registering an + -- existing template name is a no-op (or overwrite, either way harmless). 
+ UI.RegisterPanelTemplate("SortDemoOrange", { cornerRadius = 0.0, color = vector.create(1.00, 0.55, 0.00) }); + UI.RegisterPanelTemplate("SortDemoYellow", { cornerRadius = 0.0, color = vector.create(1.00, 0.95, 0.00) }); + UI.RegisterPanelTemplate("SortDemoCyan", { cornerRadius = 0.0, color = vector.create(0.00, 0.85, 0.95) }); + UI.RegisterPanelTemplate("SortDemoMagenta", { cornerRadius = 0.0, color = vector.create(1.00, 0.00, 0.80) }); + UI.RegisterPanelTemplate("SortDemoCellBG", { cornerRadius = 0.0, color = vector.create(0.10, 0.10, 0.13) }); + + -- ======================================================================= + -- Test 9 extras: two top-level canvases that overlap inside cell 9's + -- screen-space area. entt's view iterates LAST-CREATED first, so + -- the first-created canvas ends up drawn LAST (on top). To get the + -- desired stack from bottom to top: + -- SortingDemo (bottom, cell 9 background) + -- CanvasA (middle) + -- CanvasB (top) + -- ...we must create them in REVERSE of that order here: + -- CanvasB first, CanvasA next, SortingDemo last. + -- + -- Cell 9 screen bounds (col=2, row=2): x 1284..1902, y 14..344. + -- Panels are placed in y 130..280 so they don't overlap the cell title + -- at ~y 336 or the multi-line hint at ~y 22..110. 
+ -- ======================================================================= + do + -- CanvasB (created FIRST => drawn LAST => on top) + local canvasB : Canvas = UI.GetCanvas("SortingDemo_CanvasB", 0, 0, 1920, 1080) + local bPanel = canvasB:NewPanel(1490, 130, 200, 130, 0, "DebugGreen") + bPanel:SetAnchor(0.0, 0.0); bPanel:SetRelativePoint(0.0, 0.0) + local bChild = bPanel:NewPanel(0, 0, 140, 80, 0, "SortDemoYellow") + bChild:SetAnchor(0.5, 0.5); bChild:SetRelativePoint(0.5, 0.5) + local bText = bPanel:NewText("CanvasB", 10, 10, 0, "DefaultButtonText") + bText:SetAnchor(0.0, 0.0); bText:SetRelativePoint(0.0, 0.0) + + -- CanvasA (created SECOND => drawn in the middle) + local canvasA : Canvas = UI.GetCanvas("SortingDemo_CanvasA", 0, 0, 1920, 1080) + local aPanel = canvasA:NewPanel(1340, 150, 200, 130, 0, "DebugRed") + aPanel:SetAnchor(0.0, 0.0); aPanel:SetRelativePoint(0.0, 0.0) + local aChild = aPanel:NewPanel(0, 0, 140, 80, 0, "DebugBlue") + aChild:SetAnchor(0.5, 0.5); aChild:SetRelativePoint(0.5, 0.5) + local aText = aPanel:NewText("CanvasA", 10, 10, 0, "DefaultButtonText") + aText:SetAnchor(0.0, 0.0); aText:SetRelativePoint(0.0, 0.0) + end + + -- Main SortingDemo canvas. Created LAST of the three so it's iterated + -- first and drawn on the bottom, letting CanvasA and CanvasB show through + -- over cell 9's background. + local canvas : Canvas = UI.GetCanvas("SortingDemo", 0, 0, 1920, 1080); + + -- Top title bar + local title = canvas:NewText("SortingDemo - validate before/after GPU-sort refactor", 0, -6, 0, "DefaultButtonText"); + title:SetAnchor(0.5, 1.0); + title:SetRelativePoint(0.5, 1.0); + + -- 3 columns x 3 rows. row 0 = TOP, row 2 = bottom. col 0 = left, col 2 = right. + -- Numbering reads left-to-right, top-to-bottom (row*3 + col + 1). 
+ local canvasH = 1080 + local topMargin = 28 -- room for title bar above the cells + local cellW, cellH = 618, 330 + local padX, padY = 16, 16 + + local function makeCell(col, row, label) + local x = padX + col * (cellW + padX) + local y = (canvasH - topMargin) - (cellH + padY) * (row + 1) + + -- Cell background frame + local cell = canvas:NewPanel(x, y, cellW, cellH, 0, "SortDemoCellBG") + cell:SetAnchor(0.0, 0.0) + cell:SetRelativePoint(0.0, 0.0) + + -- Cell title (top-left corner of cell) + local titleWidget = cell:NewText(label, 10, -8, 0, "DefaultButtonText") + titleWidget:SetAnchor(0.0, 1.0) + titleWidget:SetRelativePoint(0.0, 1.0) + + return cell + end + + -- ========================================================= + -- Test 1 [top-left]: Sibling panels - 4-step staircase + -- 4 overlapping panels inserted in order R, G, B, Yellow. + -- Verifies the fixed insertion-order iteration with >2 siblings. + -- ========================================================= + do + local cell = makeCell(0, 0, "1: Sibling Panels (x4)") + + local templates = { "DebugRed", "DebugGreen", "DebugBlue", "SortDemoYellow" } + local size = 130 + local step = 18 + local startOffset = -((#templates - 1) * step) / 2 + for i = 1, #templates do + local off = startOffset + (i - 1) * step + local p = cell:NewPanel(off, -off, size, size, 0, templates[i]) + p:SetAnchor(0.5, 0.5); p:SetRelativePoint(0.5, 0.5) + end + + local hintText = [[ +cell: + Red + Green + Blue + Yellow <- on top]] + local hint = cell:NewText(hintText, 10, 8, 0, "DefaultDebugText") + hint:SetAnchor(0.0, 0.0); hint:SetRelativePoint(0.0, 0.0) + end + + -- ========================================================= + -- Test 2 [top-mid]: Sibling text - 4-step insertion order + -- Y step is half the X step so consecutive labels actually + -- overlap vertically (text height ~22px, Y step = 9px). 
+ -- ========================================================= + do + local cell = makeCell(1, 0, "2: Sibling Text (x4)") + + local labels = { "FIRST", "SECOND", "THIRD", "FOURTH" } + local stepX = 18 + local stepY = 9 + local startX = -((#labels - 1) * stepX) / 2 + local startY = -((#labels - 1) * stepY) / 2 + for i = 1, #labels do + local offX = startX + (i - 1) * stepX + local offY = startY + (i - 1) * stepY + local t = cell:NewText(labels[i], offX, -offY, 0, "DefaultButtonText") + t:SetAnchor(0.5, 0.5); t:SetRelativePoint(0.5, 0.5) + end + + local hintText = [[ +cell: + FIRST + SECOND + THIRD + FOURTH <- on top]] + local hint = cell:NewText(hintText, 10, 8, 0, "DefaultDebugText") + hint:SetAnchor(0.0, 0.0); hint:SetRelativePoint(0.0, 0.0) + end + + -- ========================================================= + -- Test 3 [top-right]: Mixed Panel + Text (sibling, both directions) + -- Two pairs in one cell, each isolated from the other: + -- Pair A (left side): Panel inserted FIRST, Text inserted SECOND. + -- Expected: text draws ON TOP of the panel. + -- Pair B (right side): Text inserted FIRST, Panel inserted SECOND. + -- Expected: opaque panel HIDES the text. + -- This is the critical test for the merged Widget pipeline - the + -- old two-pipeline path could not interleave panel and text draws + -- in arbitrary insertion order. 
+ -- ========================================================= + do + local cell = makeCell(2, 0, "3: Mixed P+T sibling (both ways)") + + -- Pair A: panel first, text second (text expected on top) + local pA = cell:NewPanel(-130, 25, 200, 70, 0, "DebugBlue") + pA:SetAnchor(0.5, 0.5); pA:SetRelativePoint(0.5, 0.5) + local tA = cell:NewText("Text over Blue", -130, 25, 0, "DefaultButtonText") + tA:SetAnchor(0.5, 0.5); tA:SetRelativePoint(0.5, 0.5) + + -- Pair B: text first, panel second (panel expected to hide text) + local tB = cell:NewText("Hidden by Red", 130, -25, 0, "DefaultButtonText") + tB:SetAnchor(0.5, 0.5); tB:SetRelativePoint(0.5, 0.5) + local pB = cell:NewPanel(130, -25, 200, 70, 0, "DebugRed") + pB:SetAnchor(0.5, 0.5); pB:SetRelativePoint(0.5, 0.5) + + local hintText = [[ +cell: + Blue panel + "Text over Blue" <- VISIBLE + + "Hidden by Red" <- INVISIBLE + Red panel]] + local hint = cell:NewText(hintText, 10, 8, 0, "DefaultDebugText") + hint:SetAnchor(0.0, 0.0); hint:SetRelativePoint(0.0, 0.0) + end + + -- ========================================================= + -- Test 4 [mid-left]: Hierarchy depth (single-child chain) + -- Single-child means there's no sibling order at all; each + -- inner level cleanly draws on top of its parent. Tests + -- depth-first traversal of a deep chain. 
+ -- ========================================================= + do + local cell = makeCell(0, 1, "4: Hierarchy Depth") + + local p1 = cell:NewPanel(0, 0, 180, 180, 0, "DebugRed") + p1:SetAnchor(0.5, 0.5); p1:SetRelativePoint(0.5, 0.5) + + local p2 = p1:NewPanel(0, 0, 130, 130, 0, "DebugGreen") + p2:SetAnchor(0.5, 0.5); p2:SetRelativePoint(0.5, 0.5) + + local p3 = p2:NewPanel(0, 0, 90, 90, 0, "DebugBlue") + p3:SetAnchor(0.5, 0.5); p3:SetRelativePoint(0.5, 0.5) + + local p4 = p3:NewPanel(0, 0, 50, 50, 0, "SortDemoYellow") + p4:SetAnchor(0.5, 0.5); p4:SetRelativePoint(0.5, 0.5) + + local label = p4:NewText("DEEP", 0, 0, 0, "DefaultDebugText") + label:SetAnchor(0.5, 0.5); label:SetRelativePoint(0.5, 0.5) + + local hintText = [[ +cell: + Red + Green + Blue + Yellow + "DEEP"]] + local hint = cell:NewText(hintText, 10, 8, 0, "DefaultDebugText") + hint:SetAnchor(0.0, 0.0); hint:SetRelativePoint(0.0, 0.0) + end + + -- ========================================================= + -- Test 5 [mid-mid]: Branching hierarchy + -- A single parent panel with two non-overlapping child subtrees. + -- Each child has its own grandchild text. 
Verifies depth-first + -- traversal across MULTIPLE BRANCHES at depth >= 2: + -- parent -> leftChild -> leftText -> rightChild -> rightText + -- ========================================================= + do + local cell = makeCell(1, 1, "5: Branching Hierarchy") + + local parent = cell:NewPanel(0, 0, 414, 153, 0, "DebugDarkGrey") + parent:SetAnchor(0.5, 0.5); parent:SetRelativePoint(0.5, 0.5) + + local leftChild = parent:NewPanel(-105, 0, 180, 120, 0, "DebugRed") + leftChild:SetAnchor(0.5, 0.5); leftChild:SetRelativePoint(0.5, 0.5) + local leftText = leftChild:NewText("LEFT", 0, 0, 0, "DefaultButtonText") + leftText:SetAnchor(0.5, 0.5); leftText:SetRelativePoint(0.5, 0.5) + + local rightChild = parent:NewPanel(105, 0, 180, 120, 0, "DebugBlue") + rightChild:SetAnchor(0.5, 0.5); rightChild:SetRelativePoint(0.5, 0.5) + local rightText = rightChild:NewText("RIGHT", 0, 0, 0, "DefaultButtonText") + rightText:SetAnchor(0.5, 0.5); rightText:SetRelativePoint(0.5, 0.5) + + local hintText = [[ +cell: + DarkGrey parent + Red leftChild + "LEFT" + Blue rightChild + "RIGHT"]] + local hint = cell:NewText(hintText, 10, 8, 0, "DefaultDebugText") + hint:SetAnchor(0.0, 0.0); hint:SetRelativePoint(0.0, 0.0) + end + + -- ========================================================= + -- Test 6 [mid-right]: Two top-level subtrees that overlap + -- Two parent panels (A then B) inserted as siblings of the + -- cell, each containing a child panel and a child text. Verifies + -- that the entire B subtree draws over the entire A subtree, not + -- interleaved at the leaves. 
+ -- ========================================================= + do + local cell = makeCell(2, 1, "6: Two-Parent Subtrees") + + -- Parent A (DarkGrey base) inserted first + local parentA = cell:NewPanel(-50, -15, 240, 160, 0, "DebugDarkGrey") + parentA:SetAnchor(0.5, 0.5); parentA:SetRelativePoint(0.5, 0.5) + local aChildPanel = parentA:NewPanel(0, 0, 170, 110, 0, "DebugRed") + aChildPanel:SetAnchor(0.5, 0.5); aChildPanel:SetRelativePoint(0.5, 0.5) + local aChildText = parentA:NewText("A.text", 0, 0, 0, "DefaultButtonText") + aChildText:SetAnchor(0.5, 0.5); aChildText:SetRelativePoint(0.5, 0.5) + + -- Parent B (Cyan tinted) inserted second, overlapping parent A + local parentB = cell:NewPanel(50, 15, 240, 160, 0, "SortDemoCyan") + parentB:SetAnchor(0.5, 0.5); parentB:SetRelativePoint(0.5, 0.5) + local bChildPanel = parentB:NewPanel(0, 0, 170, 110, 0, "DebugBlue") + bChildPanel:SetAnchor(0.5, 0.5); bChildPanel:SetRelativePoint(0.5, 0.5) + local bChildText = parentB:NewText("B.text", 0, 0, 0, "DefaultButtonText") + bChildText:SetAnchor(0.5, 0.5); bChildText:SetRelativePoint(0.5, 0.5) + + local hintText = [[ +cell: + DarkGrey parentA + Red child + "A.text" + Cyan parentB + Blue child + "B.text"]] + local hint = cell:NewText(hintText, 10, 8, 0, "DefaultDebugText") + hint:SetAnchor(0.0, 0.0); hint:SetRelativePoint(0.0, 0.0) + end + + -- ========================================================= + -- Test 7 [bot-left]: Z-index (layer) - 3 panels, gradient sort + -- Three sibling panels with three DIFFERENT layer values to + -- verify the post-refactor layer sort handles a gradient (not + -- just a binary high/low). The MIDDLE panel has the highest + -- layer, so post-refactor it should be on top despite being + -- inserted second. + -- This is the ONE test whose visual is expected to change after + -- the refactor. 
+ -- ========================================================= + do + local cell = makeCell(0, 2, "7: Z-index (3 layers)") + + local pRed = cell:NewPanel(-50, 20, 160, 160, 50, "DebugRed") + pRed:SetAnchor(0.5, 0.5); pRed:SetRelativePoint(0.5, 0.5) + + local pGreen = cell:NewPanel( 0, 0, 160, 160, 100, "DebugGreen") + pGreen:SetAnchor(0.5, 0.5); pGreen:SetRelativePoint(0.5, 0.5) + pGreen:SetAlpha(0.5) -- semi-transparent so we can see whether Red or Blue is stacked under it + + local pBlue = cell:NewPanel( 50, -20, 160, 160, 25, "DebugBlue") + pBlue:SetAnchor(0.5, 0.5); pBlue:SetRelativePoint(0.5, 0.5) + + local hintText = [[ +cell: + Red (layer 50, 1st) + Green (layer 100, 2nd) + Blue (layer 25, 3rd) + +BEFORE: Blue on top (insertion order) +AFTER : Green on top, Red middle, Blue bottom + (sorted by layer descending)]] + local hint = cell:NewText(hintText, 10, 8, 0, "DefaultDebugText") + hint:SetAnchor(0.0, 0.0); hint:SetRelativePoint(0.0, 0.0) + end + + -- ========================================================= + -- Test 8 [bot-mid]: Combined nesting + siblings + layers + -- A parent panel with two child subtrees, where the FIRST + -- subtree is given a high layer. Pre-refactor: insertion order + -- means the second subtree covers the first. Post-refactor: + -- the layer-elevated subtree should rise above the second. + -- Forces the sort design to commit to "does layer cross + -- subtree boundaries?" - if yes, the whole Red subtree (panel + -- + text) should appear on top, including its child text. 
+ -- ========================================================= + do + local cell = makeCell(1, 2, "8: Nest + Siblings + Layers") + + local parent = cell:NewPanel(0, 10, 460, 170, 0, "DebugDarkGrey") + parent:SetAnchor(0.5, 0.5); parent:SetRelativePoint(0.5, 0.5) + + -- Red subtree, inserted FIRST, layer 100 + local redChild = parent:NewPanel(-70, 0, 200, 110, 100, "DebugRed") + redChild:SetAnchor(0.5, 0.5); redChild:SetRelativePoint(0.5, 0.5) + local redText = redChild:NewText("L", 0, 0, 0, "DefaultButtonText") + redText:SetAnchor(0.5, 0.5); redText:SetRelativePoint(0.5, 0.5) + + -- Green subtree, inserted SECOND, layer 0, OVERLAPS Red + local greenChild = parent:NewPanel(70, 0, 200, 110, 0, "DebugGreen") + greenChild:SetAnchor(0.5, 0.5); greenChild:SetRelativePoint(0.5, 0.5) + local greenText = greenChild:NewText("R", 0, 0, 0, "DefaultButtonText") + greenText:SetAnchor(0.5, 0.5); greenText:SetRelativePoint(0.5, 0.5) + + local hintText = [[ +cell: + DarkGrey parent + Red (layer 100, 1st) + "L" + Green (layer 0, 2nd) + "R" + +BEFORE: Green's subtree covers Red in overlap +AFTER : Red's subtree on top (layer 100)]] + local hint = cell:NewText(hintText, 10, 8, 0, "DefaultDebugText") + hint:SetAnchor(0.0, 0.0); hint:SetRelativePoint(0.0, 0.0) + end + + -- ========================================================= + -- Test 9 [bot-right]: Two overlapping canvases + -- Cell 9 itself just contains an explanatory label. The actual + -- test is two extra non-RT canvases (CanvasA and CanvasB) created + -- at the top of this function, BEFORE the main canvas - they overlap each other in + -- screen space within the bot-right cell area. + -- + -- Demonstrates that canvases render WHOLE one at a time: + -- everything in CanvasB draws above everything in CanvasA in the + -- overlap zone, never interleaved at the leaf level. (Once + -- per-canvas Z-index lands, that priority will determine which + -- canvas wins; today it's strictly creation order: first created ends up on top.)
+ -- ========================================================= + do + local cell = makeCell(2, 2, "9: Two Canvases (overlap)") + + local hintText = [[ +Two extra non-RT canvases overlap inside +this cell area. CanvasB was created first, +CanvasA second (first created draws on top). + +CanvasB's entire contents draw on top of +CanvasA in the overlap zone - nothing from +the two canvases interleaves at the leaves.]] + local hint = cell:NewText(hintText, 10, 8, 0, "DefaultDebugText") + hint:SetAnchor(0.0, 0.0); hint:SetRelativePoint(0.0, 0.0) + end + +end + local function CreateClippingDemo() local canvas = UI.GetCanvas("ClippingDemo", 0, 0, 1920, 1080); @@ -140,6 +542,7 @@ local function OnGameLoaded(eventID : number, data : any) --CreateClippingDemo(); --Demo(); + --CreateSortingDemo(); --CreateGameMenu(stack); --CreateOptionsMenu(stack); diff --git a/Source/Shaders/Shaders/Include/Lighting.inc.slang b/Source/Shaders/Shaders/Include/Lighting.inc.slang index 9e5ca247..591166cf 100644 --- a/Source/Shaders/Shaders/Include/Lighting.inc.slang +++ b/Source/Shaders/Shaders/Include/Lighting.inc.slang @@ -39,9 +39,6 @@ float3 ApplyLighting(float2 uv, float3 materialColor, PixelVertexData pixelVerte { DirectionalLight light = LoadDirectionalLight(i); - light.groundAmbientColor.rgb = float3(0.4f, 0.4f, 0.4f); - light.skyAmbientColor.rgb = float3(0.4f, 0.4f, 0.4f); - // Ambient Light float nDotUp = saturate(dot(pixelVertexData.worldNormal, float3(0.0f, 1.0f, 0.0f))); // Dot product between normal and up direction float4 lightAmbientColor = lerp(light.groundAmbientColor, light.skyAmbientColor, nDotUp); // Ambient color based on normal diff --git a/Source/Shaders/Shaders/Sorting/FFX_ParallelSort.inc.slang b/Source/Shaders/Shaders/Sorting/FFX_ParallelSort.inc.slang deleted file mode 100644 index 079c31d1..00000000 --- a/Source/Shaders/Shaders/Sorting/FFX_ParallelSort.inc.slang +++ /dev/null @@ -1,454 +0,0 @@ -// FFX_ParallelSort.h -// -// Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
-// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// The above copyright notice and this permission notice shall be included in -// all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -// THE SOFTWARE. 
- -#define FFX_PARALLELSORT_SORT_BITS_PER_PASS 4 -#define FFX_PARALLELSORT_SORT_BIN_COUNT (1 << FFX_PARALLELSORT_SORT_BITS_PER_PASS) -#define FFX_PARALLELSORT_ELEMENTS_PER_THREAD 4 -#define FFX_PARALLELSORT_THREADGROUP_SIZE 128 - -////////////////////////////////////////////////////////////////////////// -// ParallelSort constant buffer parameters: -// -// NumKeys The number of keys to sort -// Shift How many bits to shift for this sort pass (we sort 4 bits at a time) -// NumBlocksPerThreadGroup How many blocks of keys each thread group needs to process -// NumThreadGroups How many thread groups are being run concurrently for sort -// NumThreadGroupsWithAdditionalBlocks How many thread groups need to process additional block data -// NumReduceThreadgroupPerBin How many thread groups are summed together for each reduced bin entry -// NumScanValues How many values to perform scan prefix (+ add) on -////////////////////////////////////////////////////////////////////////// - -struct FFX_ParallelSortCB -{ - uint NumKeys; - int NumBlocksPerThreadGroup; - uint NumThreadGroups; - uint NumThreadGroupsWithAdditionalBlocks; - uint NumReduceThreadgroupPerBin; - uint NumScanValues; -}; - -groupshared uint gs_Histogram[FFX_PARALLELSORT_THREADGROUP_SIZE * FFX_PARALLELSORT_SORT_BIN_COUNT]; -void FFX_ParallelSort_Count_uint(uint localID, uint groupID, FFX_ParallelSortCB CBuffer, uint ShiftBit, RWStructuredBuffer SrcBuffer, RWStructuredBuffer SumTable) -{ - // Start by clearing our local counts in LDS - for (int i = 0; i < FFX_PARALLELSORT_SORT_BIN_COUNT; i++) - gs_Histogram[(i * FFX_PARALLELSORT_THREADGROUP_SIZE) + localID] = 0; - - // Wait for everyone to catch up - GroupMemoryBarrierWithGroupSync(); - - // Data is processed in blocks, and how many we process can changed based on how much data we are processing - // versus how many thread groups we are processing with - int BlockSize = FFX_PARALLELSORT_ELEMENTS_PER_THREAD * FFX_PARALLELSORT_THREADGROUP_SIZE; - - // Figure out 
this thread group's index into the block data (taking into account thread groups that need to do extra reads) - uint ThreadgroupBlockStart = (BlockSize * CBuffer.NumBlocksPerThreadGroup * groupID); - uint NumBlocksToProcess = CBuffer.NumBlocksPerThreadGroup; - - if (groupID >= CBuffer.NumThreadGroups - CBuffer.NumThreadGroupsWithAdditionalBlocks) - { - ThreadgroupBlockStart += (groupID - (CBuffer.NumThreadGroups - CBuffer.NumThreadGroupsWithAdditionalBlocks)) * BlockSize; - NumBlocksToProcess++; - } - - // Get the block start index for this thread - uint BlockIndex = ThreadgroupBlockStart + localID; - - // Count value occurrence - for (uint BlockCount = 0; BlockCount < NumBlocksToProcess; BlockCount++, BlockIndex += BlockSize) - { - uint DataIndex = BlockIndex; - - // Pre-load the key values in order to hide some of the read latency - uint64_t srcKeys[FFX_PARALLELSORT_ELEMENTS_PER_THREAD]; - srcKeys[0] = SrcBuffer[DataIndex]; - srcKeys[1] = SrcBuffer[DataIndex + FFX_PARALLELSORT_THREADGROUP_SIZE]; - srcKeys[2] = SrcBuffer[DataIndex + (FFX_PARALLELSORT_THREADGROUP_SIZE * 2)]; - srcKeys[3] = SrcBuffer[DataIndex + (FFX_PARALLELSORT_THREADGROUP_SIZE * 3)]; - - for (uint i = 0; i < FFX_PARALLELSORT_ELEMENTS_PER_THREAD; i++) - { - if (DataIndex < CBuffer.NumKeys) - { - uint64_t localKey = (srcKeys[i] >> ShiftBit) & 0xf; - InterlockedAdd(gs_Histogram[(localKey * FFX_PARALLELSORT_THREADGROUP_SIZE) + localID], 1l); - DataIndex += FFX_PARALLELSORT_THREADGROUP_SIZE; - } - } - } - - // Even though our LDS layout guarantees no collisions, our thread group size is greater than a wave - // so we need to make sure all thread groups are done counting before we start tallying up the results - GroupMemoryBarrierWithGroupSync(); - - if (localID < FFX_PARALLELSORT_SORT_BIN_COUNT) - { - uint64_t sum = 0; - for (int i = 0; i < FFX_PARALLELSORT_THREADGROUP_SIZE; i++) - { - sum += gs_Histogram[localID * FFX_PARALLELSORT_THREADGROUP_SIZE + i]; - } - SumTable[localID * 
CBuffer.NumThreadGroups + groupID] = sum; - } -} - -groupshared uint64_t gs_LDSSums[FFX_PARALLELSORT_THREADGROUP_SIZE]; -uint64_t FFX_ParallelSort_ThreadgroupReduce(uint64_t localSum, uint localID) -{ - // Do wave local reduce - uint64_t waveReduced = WaveActiveSum(localSum); - - // First lane in a wave writes out wave reduction to LDS (this accounts for num waves per group greater than HW wave size) - // Note that some hardware with very small HW wave sizes (i.e. <= 8) may exhibit issues with this algorithm, and have not been tested. - uint waveID = localID / WaveGetLaneCount(); - if (WaveIsFirstLane()) - gs_LDSSums[waveID] = waveReduced; - - // Wait for everyone to catch up - GroupMemoryBarrierWithGroupSync(); - - // First wave worth of threads sum up wave reductions - if (!waveID) - waveReduced = WaveActiveSum( (localID < FFX_PARALLELSORT_THREADGROUP_SIZE / WaveGetLaneCount()) ? gs_LDSSums[localID] : 0); - - // Returned the reduced sum - return waveReduced; -} - -uint64_t FFX_ParallelSort_BlockScanPrefix(uint64_t localSum, uint localID) -{ - // Do wave local scan-prefix - uint64_t wavePrefixed = WavePrefixSum(localSum); - - // Since we are dealing with thread group sizes greater than HW wave size, we need to account for what wave we are in. 
- uint waveID = localID / WaveGetLaneCount(); - uint laneID = WaveGetLaneIndex(); - - // Last element in a wave writes out partial sum to LDS - if (laneID == WaveGetLaneCount() - 1) - gs_LDSSums[waveID] = wavePrefixed + localSum; - - // Wait for everyone to catch up - GroupMemoryBarrierWithGroupSync(); - - // First wave prefixes partial sums - if (!waveID) - gs_LDSSums[localID] = WavePrefixSum(gs_LDSSums[localID]); - - // Wait for everyone to catch up - GroupMemoryBarrierWithGroupSync(); - - // Add the partial sums back to each wave prefix - wavePrefixed += gs_LDSSums[waveID]; - - return wavePrefixed; -} - -void FFX_ParallelSort_ReduceCount(uint localID, uint groupID, FFX_ParallelSortCB CBuffer, RWStructuredBuffer SumTable, RWStructuredBuffer ReduceTable) -{ - // Figure out what bin data we are reducing - uint BinID = groupID / CBuffer.NumReduceThreadgroupPerBin; - uint BinOffset = BinID * CBuffer.NumThreadGroups; - - // Get the base index for this thread group - uint BaseIndex = (groupID % CBuffer.NumReduceThreadgroupPerBin) * FFX_PARALLELSORT_ELEMENTS_PER_THREAD * FFX_PARALLELSORT_THREADGROUP_SIZE; - - // Calculate partial sums for entries this thread reads in - uint64_t threadgroupSum = 0; - for (uint i = 0; i < FFX_PARALLELSORT_ELEMENTS_PER_THREAD; ++i) - { - uint DataIndex = BaseIndex + (i * FFX_PARALLELSORT_THREADGROUP_SIZE) + localID; - threadgroupSum += (DataIndex < CBuffer.NumThreadGroups) ? SumTable[BinOffset + DataIndex] : 0; - } - - // Reduce across the entirety of the thread group - threadgroupSum = FFX_ParallelSort_ThreadgroupReduce(threadgroupSum, localID); - - // First thread of the group writes out the reduced sum for the bin - if (!localID) - ReduceTable[groupID] = threadgroupSum; - - // What this will look like in the reduced table is: - // [ [bin0 ... bin0] [bin1 ... bin1] ... 
] -} - -// This is to transform uncoalesced loads into coalesced loads and -// then scattered loads from LDS -groupshared uint64_t gs_LDS[FFX_PARALLELSORT_ELEMENTS_PER_THREAD][FFX_PARALLELSORT_THREADGROUP_SIZE]; -void FFX_ParallelSort_ScanPrefix(uint numValuesToScan, uint localID, uint groupID, uint BinOffset, uint BaseIndex, bool AddPartialSums, - FFX_ParallelSortCB CBuffer, RWStructuredBuffer ScanSrc, RWStructuredBuffer ScanDst, RWStructuredBuffer ScanScratch) -{ - // Perform coalesced loads into LDS - { - for(uint i = 0; i < FFX_PARALLELSORT_ELEMENTS_PER_THREAD; i++) - { - uint DataIndex = BaseIndex + (i * FFX_PARALLELSORT_THREADGROUP_SIZE) + localID; - - uint col = ((i * FFX_PARALLELSORT_THREADGROUP_SIZE) + localID) / FFX_PARALLELSORT_ELEMENTS_PER_THREAD; - uint row = ((i * FFX_PARALLELSORT_THREADGROUP_SIZE) + localID) % FFX_PARALLELSORT_ELEMENTS_PER_THREAD; - gs_LDS[row][col] = (DataIndex < numValuesToScan) ? ScanSrc[BinOffset + DataIndex] : 0; - } - } - - // Wait for everyone to catch up - GroupMemoryBarrierWithGroupSync(); - - uint64_t threadgroupSum = 0; - // Calculate the local scan-prefix for current thread - { - for(uint i = 0; i < FFX_PARALLELSORT_ELEMENTS_PER_THREAD; i++) - { - uint64_t tmp = gs_LDS[i][localID]; - gs_LDS[i][localID] = threadgroupSum; - threadgroupSum += tmp; - } - } - - // Scan prefix partial sums - threadgroupSum = FFX_ParallelSort_BlockScanPrefix(threadgroupSum, localID); - - // Add reduced partial sums if requested - uint64_t partialSum = 0; - if (AddPartialSums) - { - // Partial sum additions are a little special as they are tailored to the optimal number of - // thread groups we ran in the beginning, so need to take that into account - partialSum = ScanScratch[groupID]; - } - - // Add the block scanned-prefixes back in - { - for(uint i = 0; i < FFX_PARALLELSORT_ELEMENTS_PER_THREAD; i++) - gs_LDS[i][localID] += threadgroupSum; - } - - // Wait for everyone to catch up - GroupMemoryBarrierWithGroupSync(); - - // Perform coalesced 
writes to scan dst - { - for(uint i = 0; i < FFX_PARALLELSORT_ELEMENTS_PER_THREAD; i++) - { - uint DataIndex = BaseIndex + (i * FFX_PARALLELSORT_THREADGROUP_SIZE) + localID; - - uint col = ((i * FFX_PARALLELSORT_THREADGROUP_SIZE) + localID) / FFX_PARALLELSORT_ELEMENTS_PER_THREAD; - uint row = ((i * FFX_PARALLELSORT_THREADGROUP_SIZE) + localID) % FFX_PARALLELSORT_ELEMENTS_PER_THREAD; - - if(DataIndex < numValuesToScan) - ScanDst[BinOffset + DataIndex] = gs_LDS[row][col] + partialSum; - } - } -} - -// Offset cache to avoid loading the offsets all the time -groupshared uint64_t gs_BinOffsetCache[FFX_PARALLELSORT_THREADGROUP_SIZE]; -// Local histogram for offset calculations -groupshared uint gs_LocalHistogram[FFX_PARALLELSORT_SORT_BIN_COUNT]; -// Scratch area for algorithm -groupshared uint64_t gs_LDSScratch[FFX_PARALLELSORT_THREADGROUP_SIZE]; -// LDS for uint sums -groupshared uint gs_LDSSums_uint[FFX_PARALLELSORT_THREADGROUP_SIZE]; -void FFX_ParallelSort_Scatter_uint(uint localID, uint groupID, FFX_ParallelSortCB CBuffer, uint ShiftBit, RWStructuredBuffer SrcBuffer, RWStructuredBuffer DstBuffer, - RWStructuredBuffer SumTable, RWStructuredBuffer SrcPayload, RWStructuredBuffer DstPayload) -{ - // Load the sort bin threadgroup offsets into LDS for faster referencing - if (localID < FFX_PARALLELSORT_SORT_BIN_COUNT) - gs_BinOffsetCache[localID] = SumTable[localID * CBuffer.NumThreadGroups + groupID]; - - // Wait for everyone to catch up - GroupMemoryBarrierWithGroupSync(); - - // Data is processed in blocks, and how many we process can changed based on how much data we are processing - // versus how many thread groups we are processing with - int BlockSize = FFX_PARALLELSORT_ELEMENTS_PER_THREAD * FFX_PARALLELSORT_THREADGROUP_SIZE; - - // Figure out this thread group's index into the block data (taking into account thread groups that need to do extra reads) - uint ThreadgroupBlockStart = (BlockSize * CBuffer.NumBlocksPerThreadGroup * groupID); - uint NumBlocksToProcess = 
CBuffer.NumBlocksPerThreadGroup; - - if (groupID >= CBuffer.NumThreadGroups - CBuffer.NumThreadGroupsWithAdditionalBlocks) - { - ThreadgroupBlockStart += (groupID - (CBuffer.NumThreadGroups - CBuffer.NumThreadGroupsWithAdditionalBlocks)) * BlockSize; - NumBlocksToProcess++; - } - - // Get the block start index for this thread - uint BlockIndex = ThreadgroupBlockStart + localID; - - // Count value occurences - uint newCount; - for (int BlockCount = 0; BlockCount < NumBlocksToProcess; BlockCount++, BlockIndex += BlockSize) - { - uint DataIndex = BlockIndex; - - // Pre-load the key values in order to hide some of the read latency - uint64_t srcKeys[FFX_PARALLELSORT_ELEMENTS_PER_THREAD]; - srcKeys[0] = SrcBuffer[DataIndex]; - srcKeys[1] = SrcBuffer[DataIndex + FFX_PARALLELSORT_THREADGROUP_SIZE]; - srcKeys[2] = SrcBuffer[DataIndex + (FFX_PARALLELSORT_THREADGROUP_SIZE * 2)]; - srcKeys[3] = SrcBuffer[DataIndex + (FFX_PARALLELSORT_THREADGROUP_SIZE * 3)]; - - uint srcValues[FFX_PARALLELSORT_ELEMENTS_PER_THREAD]; - srcValues[0] = SrcPayload[DataIndex]; - srcValues[1] = SrcPayload[DataIndex + FFX_PARALLELSORT_THREADGROUP_SIZE]; - srcValues[2] = SrcPayload[DataIndex + (FFX_PARALLELSORT_THREADGROUP_SIZE * 2)]; - srcValues[3] = SrcPayload[DataIndex + (FFX_PARALLELSORT_THREADGROUP_SIZE * 3)]; - - for (int i = 0; i < FFX_PARALLELSORT_ELEMENTS_PER_THREAD; i++) - { - // Clear the local histogram - if (localID < FFX_PARALLELSORT_SORT_BIN_COUNT) - gs_LocalHistogram[localID] = 0; - - uint64_t localKey = (DataIndex < CBuffer.NumKeys ? srcKeys[i] : 0xffffffffffffffff); - uint localValue = (DataIndex < CBuffer.NumKeys ? 
srcValues[i] : 0); - - // Sort the keys locally in LDS - for (uint bitShift = 0; bitShift < FFX_PARALLELSORT_SORT_BITS_PER_PASS; bitShift += 2) - { - // Figure out the keyIndex - uint64_t keyIndex = (localKey >> ShiftBit) & 0xf; - uint64_t bitKey = (keyIndex >> bitShift) & 0x3; - - // Create a packed histogram - uint64_t packedHistogram = (uint64_t)1 << (bitKey * 8); - - // Sum up all the packed keys (generates counted offsets up to current thread group) - uint64_t localSum = FFX_ParallelSort_BlockScanPrefix(packedHistogram, localID); - - // Last thread stores the updated histogram counts for the thread group - // Scratch = 0xsum3|sum2|sum1|sum0 for thread group - if (localID == (FFX_PARALLELSORT_THREADGROUP_SIZE - 1)) - gs_LDSScratch[0] = localSum + packedHistogram; - - // Wait for everyone to catch up - GroupMemoryBarrierWithGroupSync(); - - // Load the sums value for the thread group - packedHistogram = gs_LDSScratch[0]; - - // Add prefix offsets for all 4 bit "keys" (packedHistogram = 0xsum2_1_0|sum1_0|sum0|0) - packedHistogram = (packedHistogram << 8) + (packedHistogram << 16) + (packedHistogram << 24); - - // Calculate the proper offset for this thread's value - localSum += packedHistogram; - - // Calculate target offset - uint64_t keyOffset = (localSum >> (bitKey * 8)) & 0xff; - - // Re-arrange the keys (store, sync, load) - gs_LDSSums[keyOffset] = localKey; - GroupMemoryBarrierWithGroupSync(); - localKey = gs_LDSSums[localID]; - - // Wait for everyone to catch up - GroupMemoryBarrierWithGroupSync(); - - // Re-arrange the values if we have them (store, sync, load) - gs_LDSSums_uint[keyOffset] = localValue; - GroupMemoryBarrierWithGroupSync(); - localValue = gs_LDSSums_uint[localID]; - - // Wait for everyone to catch up - GroupMemoryBarrierWithGroupSync(); - } - - // Need to recalculate the keyIndex on this thread now that values have been copied around the thread group - uint64_t keyIndex = (localKey >> ShiftBit) & 0xf; - - // Reconstruct histogram - 
InterlockedAdd(gs_LocalHistogram[keyIndex], 1); - - // Wait for everyone to catch up - GroupMemoryBarrierWithGroupSync(); - - // Prefix histogram - uint histogramPrefixSum = WavePrefixSum(localID < FFX_PARALLELSORT_SORT_BIN_COUNT ? gs_LocalHistogram[localID] : 0); - - // Broadcast prefix-sum via LDS - if (localID < FFX_PARALLELSORT_SORT_BIN_COUNT) - gs_LDSScratch[localID] = histogramPrefixSum; - - // Get the global offset for this key out of the cache - uint64_t globalOffset = gs_BinOffsetCache[keyIndex]; - - // Wait for everyone to catch up - GroupMemoryBarrierWithGroupSync(); - - // Get the local offset (at this point the keys are all in increasing order from 0 -> num bins in localID 0 -> thread group size) - uint64_t localOffset = localID - gs_LDSScratch[keyIndex]; - - // Write to destination - uint totalOffset = uint(globalOffset + localOffset); - - if (totalOffset < CBuffer.NumKeys) - { - DstBuffer[totalOffset] = localKey; - DstPayload[totalOffset] = localValue; - } - - // Wait for everyone to catch up - GroupMemoryBarrierWithGroupSync(); - - // Update the cached histogram for the next set of entries - if (localID < FFX_PARALLELSORT_SORT_BIN_COUNT) - gs_BinOffsetCache[localID] += gs_LocalHistogram[localID]; - - DataIndex += FFX_PARALLELSORT_THREADGROUP_SIZE; // Increase the data offset by thread group size - } - } -} - -void FFX_ParallelSort_SetupIndirectParams(uint NumKeys, uint MaxThreadGroups, RWStructuredBuffer CBuffer, RWStructuredBuffer CountScatterArgs, RWStructuredBuffer ReduceScanArgs) -{ - CBuffer[0].NumKeys = NumKeys; - - uint BlockSize = FFX_PARALLELSORT_ELEMENTS_PER_THREAD * FFX_PARALLELSORT_THREADGROUP_SIZE; - uint NumBlocks = (NumKeys + BlockSize - 1) / BlockSize; - - // Figure out data distribution - uint NumThreadGroupsToRun = MaxThreadGroups; - uint BlocksPerThreadGroup = (NumBlocks / NumThreadGroupsToRun); - CBuffer[0].NumThreadGroupsWithAdditionalBlocks = NumBlocks % NumThreadGroupsToRun; - - if (NumBlocks < NumThreadGroupsToRun) - { - 
BlocksPerThreadGroup = 1; - NumThreadGroupsToRun = NumBlocks; - CBuffer[0].NumThreadGroupsWithAdditionalBlocks = 0; - } - - CBuffer[0].NumThreadGroups = NumThreadGroupsToRun; - CBuffer[0].NumBlocksPerThreadGroup = BlocksPerThreadGroup; - - // Calculate the number of thread groups to run for reduction (each thread group can process BlockSize number of entries) - uint NumReducedThreadGroupsToRun = FFX_PARALLELSORT_SORT_BIN_COUNT * ((BlockSize > NumThreadGroupsToRun) ? 1 : (NumThreadGroupsToRun + BlockSize - 1) / BlockSize); - CBuffer[0].NumReduceThreadgroupPerBin = NumReducedThreadGroupsToRun / FFX_PARALLELSORT_SORT_BIN_COUNT; - CBuffer[0].NumScanValues = NumReducedThreadGroupsToRun; // The number of reduce thread groups becomes our scan count (as each thread group writes out 1 value that needs scan prefix) - - // Setup dispatch arguments - CountScatterArgs[0] = NumThreadGroupsToRun; - CountScatterArgs[1] = 1; - CountScatterArgs[2] = 1; - - ReduceScanArgs[0] = NumReducedThreadGroupsToRun; - ReduceScanArgs[1] = 1; - ReduceScanArgs[2] = 1; -}
diff --git a/Source/Shaders/Shaders/Sorting/Radix/Constants.inc.slang b/Source/Shaders/Shaders/Sorting/Radix/Constants.inc.slang
new file mode 100644
index 00000000..475c0d80
--- /dev/null
+++ b/Source/Shaders/Shaders/Sorting/Radix/Constants.inc.slang
@@ -0,0 +1,7 @@
+#pragma once
+
+static const uint RADIX = 256;                // bins per pass: one per value of an 8-bit digit
+static const uint WORKGROUP_SIZE = 512;       // threads per workgroup in all three radix passes
+static const uint PARTITION_DIVISION = 8;     // keys processed by each thread of a partition
+static const uint PARTITION_SIZE = PARTITION_DIVISION * WORKGROUP_SIZE;  // keys per partition (4096)
+static const uint MAX_SUBGROUP_SIZE = 128;    // capacity of the per-wave scratch array in the spine pass
diff --git a/Source/Shaders/Shaders/Sorting/Radix/Downsweep.cs.slang b/Source/Shaders/Shaders/Sorting/Radix/Downsweep.cs.slang
new file mode 100644
index 00000000..149cc652
--- /dev/null
+++ b/Source/Shaders/Shaders/Sorting/Radix/Downsweep.cs.slang
@@ -0,0 +1,235 @@
+permutation KEY_VALUE = [0, 1];
+// Reduce-then-scan radix sort -- downsweep pass.
+// Consumes the scanned histograms to scatter keys (and values, if KEY_VALUE=1) into their
+// globally sorted positions. Dispatched with `(partitionCount, 1, 1)` groups, WORKGROUP_SIZE
+// threads each.
+
+#include "Sorting/Radix/Constants.inc.slang"
+
+struct RadixPushConstants
+{
+  uint pass;          // radix pass index; the digit sorted on is the 8 bits at offset pass * 8
+  uint elementCount;  // number of valid keys; slots at or beyond it are treated as padding
+};
+[[vk::push_constant]] RadixPushConstants _radixPC;
+
+[[vk::binding(1, PER_PASS)]] RWStructuredBuffer<uint> globalHistogram;     // read at [RADIX * pass + bin]
+[[vk::binding(2, PER_PASS)]] RWStructuredBuffer<uint> partitionHistogram;  // read at [RADIX * partition + bin]
+[[vk::binding(3, PER_PASS)]] RWStructuredBuffer<uint> keysIn;
+[[vk::binding(4, PER_PASS)]] RWStructuredBuffer<uint> keysOut;
+#if KEY_VALUE
+[[vk::binding(5, PER_PASS)]] RWStructuredBuffer<uint> valuesIn;
+[[vk::binding(6, PER_PASS)]] RWStructuredBuffer<uint> valuesOut;
+#endif
+
+// Holds RADIX * waveCount counters (4096 at wave size 32, 2048 at 64), so it only fits for wave
+// sizes >= 32. Once the offsets are known the same memory is reused to stage keys and values.
+groupshared uint localHistogram[PARTITION_SIZE];
+groupshared uint localHistogramSum[RADIX];  // scan scratch, then the per-bin destination base
+// Returns a 128-bit lane mask (4 x 32 bits) with bits [0, id) set, i.e. every lane below `id`.
+uint4 GetExclusiveWaveMask(uint id)
+{
+  // clamp bit-shift right operand between 0..31 to avoid undefined behavior.
+  uint shift = (1 << bitfieldExtract(id, 0, 5)) - 1;  // (1 << (id % 32)) - 1
+  // right shift operation on signed integer copies sign bit, use the trick for masking.
+  // (negative)     >> 31 = 111...111
+  // (non-negative) >> 31 = 000...000
+  int x = int(id) >> 5;  // index of the 32-bit word that contains lane `id`
+  return uint4((shift & ((-1 - x) >> 31)) | ((0 - x) >> 31),  // lanes 0..31
+               (shift & ((0 - x) >> 31)) | ((1 - x) >> 31),   // lanes 32..63
+               (shift & ((1 - x) >> 31)) | ((2 - x) >> 31),   // lanes 64..95
+               (shift & ((2 - x) >> 31)) | ((3 - x) >> 31));  // lanes 96..127
+}
+
+uint GetBitCount(uint4 value)  // total number of set bits across all four components
+{
+  uint4 result = countbits(value);
+  return result[0] + result[1] + result[2] + result[3];
+}
+
+[shader("compute")]
+[numthreads(WORKGROUP_SIZE)]
+void main(uint3 groupThreadID: SV_GroupThreadID, uint3 groupId: SV_GroupID,
+          uint groupIndex: SV_GroupIndex)
+{
+  const uint pass = _radixPC.pass;
+  const uint elementCount = _radixPC.elementCount;
+
+  uint laneIndex = WaveGetLaneIndex();          // 0..31 or 0..63
+  uint laneCount = WaveGetLaneCount();          // 32 or 64
+  uint waveIndex = groupIndex / laneCount;      // 0..15 or 0..7
+  uint waveCount = WORKGROUP_SIZE / laneCount;  // 16 or 8
+  uint index = waveIndex * laneCount + laneIndex;  // NOTE(review): assumes lanes follow groupIndex order
+
+  uint4 waveMask = GetExclusiveWaveMask(laneIndex);  // lanes of this wave that precede this one
+
+  uint partitionIndex = groupId.x;
+  uint partitionStart = partitionIndex * PARTITION_SIZE;
+
+  if (partitionStart >= elementCount)  // depends only on groupId, so the whole group exits together
+    return;
+
+  if (index < RADIX) {
+    for (int i = 0; i < waveCount; ++i) {
+      localHistogram[waveCount * index + i] = 0;  // layout: bin-major, wave-minor
+    }
+  }
+  GroupMemoryBarrierWithGroupSync();
+
+  // load from global memory, local histogram and offset
+  uint localKeys[PARTITION_DIVISION];
+  uint localRadix[PARTITION_DIVISION];
+  uint localOffsets[PARTITION_DIVISION];
+  uint waveHistogram[PARTITION_DIVISION];
+#if KEY_VALUE
+  uint localValues[PARTITION_DIVISION];
+#endif
+
+  [ForceUnroll]
+  for (int i = 0; i < PARTITION_DIVISION; ++i) {
+    uint keyIndex =
+        partitionStart + (PARTITION_DIVISION * laneCount) * waveIndex + i * laneCount + laneIndex;
+    uint key = keyIndex < elementCount ? keysIn[keyIndex] : 0xffffffff;  // padding gets digit 0xff
+    localKeys[i] = key;
+
+#if KEY_VALUE
+    localValues[i] = keyIndex < elementCount ? valuesIn[keyIndex] : 0;
+#endif
+
+    uint radix = bitfieldExtract(key, pass * 8, 8);
+    localRadix[i] = radix;
+
+    // mask per digit: after the 8 ballots, `mask` holds the lanes whose digit equals this lane's
+    uint4 mask = WaveActiveBallot(true);
+    [ForceUnroll]
+    for (int j = 0; j < 8; ++j) {
+      uint digit = (radix >> j) & 1;
+      uint4 ballot = WaveActiveBallot(digit == 1);
+      // digit - 1 is 0 or 0xffffffff. xor to flip.
+      mask &= uint4(digit - 1) ^ ballot;
+    }
+
+    // wave level offset for radix
+    uint waveOffset = GetBitCount(waveMask & mask);  // matching lanes below this one
+    uint radixCount = GetBitCount(mask);             // all matching lanes in the wave
+
+    // elect a representative per radix, add to histogram
+    if (waveOffset == 0) {
+      // accumulate to local histogram
+      __atomic_add(localHistogram[waveCount * radix + waveIndex], radixCount, MemoryOrder.Relaxed);
+      waveHistogram[i] = radixCount;
+    } else {
+      waveHistogram[i] = 0;
+    }
+
+    localOffsets[i] = waveOffset;
+  }
+  GroupMemoryBarrierWithGroupSync();
+
+  // local histogram reduce 4096 or 2048: per-wave exclusive scan, wave totals go to the sum array
+  for (uint i = index; i < RADIX * waveCount; i += WORKGROUP_SIZE) {
+    uint v = localHistogram[i];
+    uint sum = WaveActiveSum(v);
+    uint excl = WavePrefixSum(v);
+    localHistogram[i] = excl;
+    if (laneIndex == 0) {
+      localHistogramSum[i / laneCount] = sum;
+    }
+  }
+  GroupMemoryBarrierWithGroupSync();
+
+  // local histogram reduce 128 or 32: scan the wave totals, second-level totals are stored behind
+  uint intermediateOffset0 = RADIX * waveCount / laneCount;
+  if (index < intermediateOffset0) {
+    uint v = localHistogramSum[index];
+    uint sum = WaveActiveSum(v);
+    uint excl = WavePrefixSum(v);
+    localHistogramSum[index] = excl;
+    if (laneIndex == 0) {
+      localHistogramSum[intermediateOffset0 + index / laneCount] = sum;
+    }
+  }
+  GroupMemoryBarrierWithGroupSync();
+
+  // local histogram reduce 4 or 1: scan the second-level totals
+  uint intermediateSize1 = max(RADIX * waveCount / laneCount / laneCount, 1);
+  if (index < intermediateSize1) {
+    uint v = localHistogramSum[intermediateOffset0 + index];
+    uint excl = WavePrefixSum(v);
+    localHistogramSum[intermediateOffset0 + index] = excl;
+  }
+  GroupMemoryBarrierWithGroupSync();
+
+  // local histogram add 128 or 32: fold the second-level scan back into the wave totals
+  if (index < intermediateOffset0) {
+    localHistogramSum[index] += localHistogramSum[intermediateOffset0 + index / laneCount];
+  }
+  GroupMemoryBarrierWithGroupSync();
+
+  // local histogram add 4096 or 2048: localHistogram becomes a full exclusive scan
+  for (uint i = index; i < RADIX * waveCount; i += WORKGROUP_SIZE) {
+    localHistogram[i] += localHistogramSum[i / laneCount];
+  }
+  GroupMemoryBarrierWithGroupSync();
+
+  // post-scan stage: turn each key's in-wave rank into its slot inside the sorted partition
+  [ForceUnroll]
+  for (int i = 0; i < PARTITION_DIVISION; ++i) {
+    uint radix = localRadix[i];
+    localOffsets[i] += localHistogram[waveCount * radix + waveIndex];
+    // every thread must have read its base before any representative advances it for round i + 1
+    GroupMemoryBarrierWithGroupSync();
+    if (waveHistogram[i] > 0) {
+      __atomic_add(localHistogram[waveCount * radix + waveIndex], waveHistogram[i],
+                   MemoryOrder.Relaxed);
+    }
+    GroupMemoryBarrierWithGroupSync();
+  }
+
+  // after atomicAdd, localHistogram contains inclusive sum
+  if (index < RADIX) {
+    uint v = index == 0 ? 0 : localHistogram[waveCount * index - 1];  // local keys in lower bins
+    localHistogramSum[index] = globalHistogram[RADIX * pass + index] +
+                               partitionHistogram[RADIX * partitionIndex + index] - v;
+  }
+  GroupMemoryBarrierWithGroupSync();
+
+  // rearrange keys. grouping keys together makes dstOffset to be almost sequential, grants huge
+  // speed boost. now localHistogram is unused, so alias memory.
+  [ForceUnroll]
+  for (int i = 0; i < PARTITION_DIVISION; ++i) {
+    localHistogram[localOffsets[i]] = localKeys[i];
+  }
+  GroupMemoryBarrierWithGroupSync();
+
+  // binning: slot i of the locally sorted partition goes to its bin's base plus i
+  for (uint i = index; i < PARTITION_SIZE; i += WORKGROUP_SIZE) {
+    uint key = localHistogram[i];
+    uint radix = bitfieldExtract(key, pass * 8, 8);
+    uint dstOffset = localHistogramSum[radix] + i;
+    if (dstOffset < elementCount) {  // drops the 0xffffffff padding of a partial last partition
+      keysOut[dstOffset] = key;
+    }
+
+#if KEY_VALUE
+    localKeys[i / WORKGROUP_SIZE] = dstOffset;  // keys are written out; reuse as destination cache
+#endif
+  }
+
+#if KEY_VALUE
+  GroupMemoryBarrierWithGroupSync();
+
+  [ForceUnroll]
+  for (int i = 0; i < PARTITION_DIVISION; ++i) {
+    localHistogram[localOffsets[i]] = localValues[i];
+  }
+  GroupMemoryBarrierWithGroupSync();
+
+  for (uint i = index; i < PARTITION_SIZE; i += WORKGROUP_SIZE) {
+    uint value = localHistogram[i];
+    uint dstOffset = localKeys[i / WORKGROUP_SIZE];
+    if (dstOffset < elementCount) {
+      valuesOut[dstOffset] = value;
+    }
+  }
+#endif
+}
diff --git a/Source/Shaders/Shaders/Sorting/Radix/Spine.cs.slang b/Source/Shaders/Shaders/Sorting/Radix/Spine.cs.slang
new file mode 100644
index 00000000..5ac0bd3b
--- /dev/null
+++ b/Source/Shaders/Shaders/Sorting/Radix/Spine.cs.slang
@@ -0,0 +1,97 @@
+// Reduce-then-scan radix sort -- spine pass.
+// For each radix bin, prefix-scans the per-partition histograms so every partition knows its
+// starting offset for every bin. Also prefix-scans the per-pass global histogram (bin 0's
+// workgroup handles that).
+// Dispatched with `(RADIX=256, 1, 1)` groups, WORKGROUP_SIZE threads each.
+
+#include "Sorting/Radix/Constants.inc.slang"
+
+struct RadixPushConstants
+{
+  uint pass;          // radix pass index; selects the RADIX-sized slice of globalHistogram to scan
+  uint elementCount;  // number of valid keys; determines how many partitions exist
+};
+[[vk::push_constant]] RadixPushConstants _radixPC;
+
+[[vk::binding(1, PER_PASS)]] RWStructuredBuffer<uint> globalHistogram;
+[[vk::binding(2, PER_PASS)]] RWStructuredBuffer<uint> partitionHistogram;
+
+groupshared uint reduction;                        // running total carried from chunk to chunk
+groupshared uint intermediate[MAX_SUBGROUP_SIZE];  // per-wave totals, then their exclusive scan
+
+[shader("compute")]
+[numthreads(WORKGROUP_SIZE)]
+void main(uint3 groupThreadID: SV_GroupThreadID, uint3 groupId: SV_GroupID,
+          uint groupIndex: SV_GroupIndex)
+{
+  const uint pass = _radixPC.pass;
+  const uint elementCount = _radixPC.elementCount;
+
+  uint laneIndex = WaveGetLaneIndex();  // 0..laneCount-1
+  uint laneCount = WaveGetLaneCount();  // wave size, e.g. 32 or 64
+  uint waveIndex = groupIndex / laneCount;
+  uint waveCount = WORKGROUP_SIZE / laneCount;
+  uint index = waveIndex * laneCount + laneIndex;
+
+  uint radix = groupId.x;  // each workgroup scans exactly one bin across all partitions
+
+  uint partitionCount = (elementCount + PARTITION_SIZE - 1) / PARTITION_SIZE;
+
+  if (index == 0) {
+    reduction = 0;
+  }
+  GroupMemoryBarrierWithGroupSync();
+
+  // Exclusive scan of this bin over all partitions, WORKGROUP_SIZE partitions per iteration.
+  for (uint i = 0; WORKGROUP_SIZE * i < partitionCount; ++i) {
+    uint partitionIndex = WORKGROUP_SIZE * i + index;
+    uint value =
+        partitionIndex < partitionCount ? partitionHistogram[RADIX * partitionIndex + radix] : 0;
+    uint excl = WavePrefixSum(value) + reduction;  // in-wave prefix + total of earlier chunks
+    uint sum = WaveActiveSum(value);
+
+    if (WaveIsFirstLane()) {
+      intermediate[waveIndex] = sum;
+    }
+    GroupMemoryBarrierWithGroupSync();
+
+    if (index < waveCount) {  // a single wave scans the wave totals; needs waveCount <= laneCount
+      uint waveExcl = WavePrefixSum(intermediate[index]);
+      uint chunkSum = WaveActiveSum(intermediate[index]);
+      intermediate[index] = waveExcl;
+
+      if (index == 0) {
+        reduction += chunkSum;  // carried into the next chunk
+      }
+    }
+    GroupMemoryBarrierWithGroupSync();
+
+    if (partitionIndex < partitionCount) {
+      excl += intermediate[waveIndex];
+      partitionHistogram[RADIX * partitionIndex + radix] = excl;
+    }
+    GroupMemoryBarrierWithGroupSync();  // intermediate/reduction are reused by the next iteration
+  }
+
+  if (radix == 0) {
+    // One workgroup also scans the global histogram of this pass. Only RADIX of the
+    // WORKGROUP_SIZE threads own a bin, but barriers must be reached by every thread of the
+    // group, so `inRange` guards the memory accesses only and never a barrier.
+    const bool inRange = index < RADIX;
+    uint value = inRange ? globalHistogram[RADIX * pass + index] : 0;
+    uint excl = WavePrefixSum(value);
+    uint sum = WaveActiveSum(value);
+    if (inRange && WaveIsFirstLane()) {
+      intermediate[waveIndex] = sum;
+    }
+    GroupMemoryBarrierWithGroupSync();
+    if (index < RADIX / laneCount) {  // needs RADIX / laneCount <= laneCount (wave size >= 16)
+      intermediate[index] = WavePrefixSum(intermediate[index]);
+    }
+    GroupMemoryBarrierWithGroupSync();
+    if (inRange) {
+      globalHistogram[RADIX * pass + index] = excl + intermediate[waveIndex];
+    }
+  }
+}
diff --git a/Source/Shaders/Shaders/Sorting/Radix/Upsweep.cs.slang b/Source/Shaders/Shaders/Sorting/Radix/Upsweep.cs.slang
new file mode 100644
index 00000000..ec38a4c5
--- /dev/null
+++ b/Source/Shaders/Shaders/Sorting/Radix/Upsweep.cs.slang
@@ -0,0 +1,59 @@
+// Reduce-then-scan radix sort -- upsweep pass.
+// For each partition of PARTITION_SIZE keys, counts per-radix-bin histograms:
+//   - Writes the per-partition histogram to partitionHistogram[RADIX * partitionIndex + radix]
+//   - Accumulates the per-pass global histogram via atomic add
+// Dispatched with `(partitionCount, 1, 1)` groups, WORKGROUP_SIZE threads each.
+
+#include "Sorting/Radix/Constants.inc.slang"
+
+struct RadixPushConstants
+{
+  uint pass;          // 0..3 for 4 passes of 8 bits over a u32 key
+  uint elementCount;  // total valid keys (the last partition may be partial)
+};
+[[vk::push_constant]] RadixPushConstants _radixPC;
+
+[[vk::binding(1, PER_PASS)]] RWStructuredBuffer<uint> globalHistogram;     // [RADIX * pass + bin]
+[[vk::binding(2, PER_PASS)]] RWStructuredBuffer<uint> partitionHistogram;  // [RADIX * partition + bin]
+[[vk::binding(3, PER_PASS)]] RWStructuredBuffer<uint> keys;
+
+groupshared uint localHistogram[RADIX];  // per-bin key counts of this workgroup's partition
+
+[shader("compute")]
+[numthreads(WORKGROUP_SIZE)]
+void main(uint3 groupThreadID: SV_GroupThreadID, uint3 groupId: SV_GroupID)
+{
+  const uint pass = _radixPC.pass;
+  const uint elementCount = _radixPC.elementCount;
+
+  uint index = groupThreadID.x;
+  uint partitionIndex = groupId.x;
+  uint partitionStart = partitionIndex * PARTITION_SIZE;
+
+  // discard all workgroup invocations (the test depends only on groupId, so no barrier is skipped)
+  if (partitionStart >= elementCount) {
+    return;
+  }
+
+  if (index < RADIX) {
+    localHistogram[index] = 0;
+  }
+  GroupMemoryBarrierWithGroupSync();
+
+  // local histogram: slots past elementCount count as key 0xffffffff, i.e. bin 0xff in any pass
+  for (uint i = 0; i < PARTITION_DIVISION; ++i) {
+    uint keyIndex = partitionStart + WORKGROUP_SIZE * i + index;
+    uint key = keyIndex < elementCount ? keys[keyIndex] : 0xffffffff;
+    uint radix = bitfieldExtract(key, 8 * pass, 8);
+    __atomic_add(localHistogram[radix], 1, MemoryOrder.Relaxed);
+  }
+  GroupMemoryBarrierWithGroupSync();
+
+  if (index < RADIX) {
+    // set to partition histogram (thread `index` owns bin `index`)
+    partitionHistogram[RADIX * partitionIndex + index] = localHistogram[index];
+
+    // add to global histogram
+    __atomic_add(globalHistogram[RADIX * pass + index], localHistogram[index], MemoryOrder.Relaxed);
+  }
+}
diff --git a/Source/Shaders/Shaders/Sorting/SortCount.cs.slang b/Source/Shaders/Shaders/Sorting/SortCount.cs.slang
deleted file mode 100644
index a2ae2ce1..00000000
--- a/Source/Shaders/Shaders/Sorting/SortCount.cs.slang
+++ /dev/null
@@ -1,40 +0,0 @@
-// Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
-// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// The above copyright notice and this permission notice shall be included in -// all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -// THE SOFTWARE. 
- -//-------------------------------------------------------------------------------------- -// ParallelSort Shaders/Includes -//-------------------------------------------------------------------------------------- -#include "Sorting/FFX_ParallelSort.inc.slang" - -struct CountCB -{ - uint shiftBits; -}; - -[[vk::push_constant]] CountCB _countCB; // Count Indirect Constant buffer - -[[vk::binding(0, PER_PASS)]] ConstantBuffer _constants; // Constant Buffer -[[vk::binding(1, PER_PASS)]] RWStructuredBuffer _keys; // The unsorted keys or scan data -[[vk::binding(2, PER_PASS)]] RWStructuredBuffer _sumTable; // The sum table we will write sums to - -[shader("compute")] -[numthreads(FFX_PARALLELSORT_THREADGROUP_SIZE, 1, 1)] -void main(uint localID : SV_GroupThreadID, uint groupID : SV_GroupID) -{ - // Call the uint version of the count part of the algorithm - FFX_ParallelSort_Count_uint(localID, groupID, _constants, _countCB.shiftBits, _keys, _sumTable); -} diff --git a/Source/Shaders/Shaders/Sorting/SortCountReduce.cs.slang b/Source/Shaders/Shaders/Sorting/SortCountReduce.cs.slang deleted file mode 100644 index ff5e797e..00000000 --- a/Source/Shaders/Shaders/Sorting/SortCountReduce.cs.slang +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// The above copyright notice and this permission notice shall be included in -// all copies or substantial portions of the Software. 
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -// THE SOFTWARE. - -//-------------------------------------------------------------------------------------- -// ParallelSort Shaders/Includes -//-------------------------------------------------------------------------------------- -#include "Sorting/FFX_ParallelSort.inc.slang" - -[[vk::binding(0, PER_PASS)]] ConstantBuffer _constants; // Constant Buffer -[[vk::binding(1, PER_PASS)]] RWStructuredBuffer _sumTable; // The sum table we will write sums to -[[vk::binding(2, PER_PASS)]] RWStructuredBuffer _reducedSumTable; // The reduced sum table we will write sums to - -[shader("compute")] -[numthreads(FFX_PARALLELSORT_THREADGROUP_SIZE, 1, 1)] -void main(uint localID : SV_GroupThreadID, uint groupID : SV_GroupID) -{ - // Call the reduce part of the algorithm - FFX_ParallelSort_ReduceCount(localID, groupID, _constants, _sumTable, _reducedSumTable); -} diff --git a/Source/Shaders/Shaders/Sorting/SortScan.cs.slang b/Source/Shaders/Shaders/Sorting/SortScan.cs.slang deleted file mode 100644 index baf962e8..00000000 --- a/Source/Shaders/Shaders/Sorting/SortScan.cs.slang +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved. 
-// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// The above copyright notice and this permission notice shall be included in -// all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -// THE SOFTWARE. 
- -//-------------------------------------------------------------------------------------- -// ParallelSort Shaders/Includes -//-------------------------------------------------------------------------------------- -#include "Sorting/FFX_ParallelSort.inc.slang" - -[[vk::binding(0, PER_PASS)]] ConstantBuffer _constants; // Constant Buffer -[[vk::binding(1, PER_PASS)]] RWStructuredBuffer _scanSrc; // Source for Scan Data -[[vk::binding(2, PER_PASS)]] RWStructuredBuffer _scanDst; // Destination for Scan Data -[[vk::binding(3, PER_PASS)]] RWStructuredBuffer _scanScratch; // Scratch data for Scan - -[shader("compute")] -[numthreads(FFX_PARALLELSORT_THREADGROUP_SIZE, 1, 1)] -void main(uint localID : SV_GroupThreadID, uint groupID : SV_GroupID) -{ - uint baseIndex = FFX_PARALLELSORT_ELEMENTS_PER_THREAD * FFX_PARALLELSORT_THREADGROUP_SIZE * groupID; - FFX_ParallelSort_ScanPrefix(_constants.NumScanValues, localID, groupID, 0, baseIndex, false, - _constants, _scanSrc, _scanDst, _scanScratch); -} \ No newline at end of file diff --git a/Source/Shaders/Shaders/Sorting/SortScanAdd.cs.slang b/Source/Shaders/Shaders/Sorting/SortScanAdd.cs.slang deleted file mode 100644 index f0242849..00000000 --- a/Source/Shaders/Shaders/Sorting/SortScanAdd.cs.slang +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// The above copyright notice and this permission notice shall be included in -// all copies or substantial portions of the Software. 
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -// THE SOFTWARE. - -//-------------------------------------------------------------------------------------- -// ParallelSort Shaders/Includes -//-------------------------------------------------------------------------------------- -#include "Sorting/FFX_ParallelSort.inc.slang" - -[[vk::binding(0, PER_PASS)]] ConstantBuffer _constants; // Constant Buffer -[[vk::binding(1, PER_PASS)]] RWStructuredBuffer _scanSrc; // Source for Scan Data -[[vk::binding(2, PER_PASS)]] RWStructuredBuffer _scanDst; // Destination for Scan Data -[[vk::binding(3, PER_PASS)]] RWStructuredBuffer _scanScratch; // Scratch data for Scan - -[shader("compute")] -[numthreads(FFX_PARALLELSORT_THREADGROUP_SIZE, 1, 1)] -void main(uint localID : SV_GroupThreadID, uint groupID : SV_GroupID) -{ - // When doing adds, we need to access data differently because reduce - // has a more specialized access pattern to match optimized count - // Access needs to be done similarly to reduce - // Figure out what bin data we are reducing - uint binID = groupID / _constants.NumReduceThreadgroupPerBin; - uint binOffset = binID * _constants.NumThreadGroups; - - // Get the base index for this thread group - //uint BaseIndex = FFX_PARALLELSORT_ELEMENTS_PER_THREAD * FFX_PARALLELSORT_THREADGROUP_SIZE * (groupID / FFX_PARALLELSORT_SORT_BIN_COUNT); - uint baseIndex = (groupID % _constants.NumReduceThreadgroupPerBin) * FFX_PARALLELSORT_ELEMENTS_PER_THREAD * FFX_PARALLELSORT_THREADGROUP_SIZE; - - FFX_ParallelSort_ScanPrefix(_constants.NumThreadGroups, localID, 
groupID, binOffset, baseIndex, true, - _constants, _scanSrc, _scanDst, _scanScratch); -} \ No newline at end of file diff --git a/Source/Shaders/Shaders/Sorting/SortScatter.cs.slang b/Source/Shaders/Shaders/Sorting/SortScatter.cs.slang deleted file mode 100644 index 97446ffe..00000000 --- a/Source/Shaders/Shaders/Sorting/SortScatter.cs.slang +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// The above copyright notice and this permission notice shall be included in -// all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -// THE SOFTWARE. 
- -//-------------------------------------------------------------------------------------- -// ParallelSort Shaders/Includes -//-------------------------------------------------------------------------------------- -#include "Sorting/FFX_ParallelSort.inc.slang" - -struct ScatterCB -{ - uint shiftBits; -}; - -[[vk::push_constant]] ScatterCB _scatterCB; // Count Indirect Constant buffer - -[[vk::binding(0, PER_PASS)]] ConstantBuffer _constants; // Constant Buffer -[[vk::binding(1, PER_PASS)]] RWStructuredBuffer _keys; // The unsorted keys or scan data -[[vk::binding(2, PER_PASS)]] RWStructuredBuffer _values; // The payload data -[[vk::binding(3, PER_PASS)]] RWStructuredBuffer _sumTable; // The sum table we will write sums to -[[vk::binding(5, PER_PASS)]] RWStructuredBuffer _writeKeys; // The sorted keys or prefixed data -[[vk::binding(6, PER_PASS)]] RWStructuredBuffer _writeValues; // the sorted payload data - -[shader("compute")] -[numthreads(FFX_PARALLELSORT_THREADGROUP_SIZE, 1, 1)] -void main(uint localID : SV_GroupThreadID, uint groupID : SV_GroupID) -{ - FFX_ParallelSort_Scatter_uint(localID, groupID, _constants, _scatterCB.shiftBits, _keys, _writeKeys, _sumTable, _values, _writeValues); -} \ No newline at end of file diff --git a/Source/Shaders/Shaders/Sorting/SortSetupIndirectParameters.cs.slang b/Source/Shaders/Shaders/Sorting/SortSetupIndirectParameters.cs.slang deleted file mode 100644 index 302d46d3..00000000 --- a/Source/Shaders/Shaders/Sorting/SortSetupIndirectParameters.cs.slang +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved. 
-// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// The above copyright notice and this permission notice shall be included in -// all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -// THE SOFTWARE. 
- -//-------------------------------------------------------------------------------------- -// ParallelSort Shaders/Includes -//-------------------------------------------------------------------------------------- -#include "Sorting/FFX_ParallelSort.inc.slang" - -struct SetupCB -{ - uint maxThreadGroups; -}; - -struct NumKeys -{ - uint numKeys; -}; - -[[vk::push_constant]] SetupCB _setupCB; // Setup Indirect Constant buffer - -[[vk::binding(0, PER_PASS)]] StructuredBuffer _numKeys; -[[vk::binding(1, PER_PASS)]] RWStructuredBuffer _constants; // UAV for constant buffer parameters for indirect execution -[[vk::binding(2, PER_PASS)]] RWStructuredBuffer _countScatterArgs; // Count and Scatter Args for indirect execution -[[vk::binding(3, PER_PASS)]] RWStructuredBuffer _reduceScanArgs; // Reduce and Scan Args for indirect execution - -[shader("compute")] -[numthreads(1, 1, 1)] -void main(uint localID : SV_GroupThreadID) -{ - FFX_ParallelSort_SetupIndirectParams(_numKeys[0].numKeys, _setupCB.maxThreadGroups, _constants, _countScatterArgs, _reduceScanArgs); -} \ No newline at end of file diff --git a/Source/Shaders/Shaders/UI/Text.ps.slang b/Source/Shaders/Shaders/UI/Text.ps.slang deleted file mode 100644 index 3fa5ac3a..00000000 --- a/Source/Shaders/Shaders/UI/Text.ps.slang +++ /dev/null @@ -1,112 +0,0 @@ - -#include "Include/Common.inc.slang" - -struct CharDrawData -{ - uint4 packed0; // x: textureIndex & charIndex, y: clipMaskTextureIndex, z: textColor, w: borderColor - float4 packed1; // x: borderSize, y: padding, zw: unitRangeXY - uint4 packed2; // x: clipRegionMinXY, y: clipRegionMaxXY, z: clipMaskRegionMinXY, w: clipMaskRegionMaxXY - int4 packed3; // x: worldPositionIndex, yzw: unused -}; -[[vk::binding(2, PER_PASS)]] StructuredBuffer _charDrawDatas; - -[[vk::binding(3, PER_PASS)]] SamplerState _sampler; -[[vk::binding(4, PER_PASS)]] Texture2D _fontTextures[4096]; -[[vk::binding(5, PER_PASS)]] Texture2D _textures[4096]; - -struct VertexOutput -{ - float4 position 
: SV_POSITION; - float4 uvAndScreenPos : TEXCOORD0; - uint charDrawDataID : TEXCOORD1; -}; - -float Median(float a, float b, float c) -{ - return max(min(a, b), min(max(a, b), c)); -} - -float ScreenPxRange(float2 uv, float2 unitRange) -{ - float2 screenTexSize = float2(1.0f, 1.0f) / fwidth(uv); - return max(0.5 * dot(unitRange, screenTexSize), 1.0); -} - -bool ShouldDiscard(float2 pos, float2 clipMin, float2 clipMax) -{ - // Check if the position is outside the clip rect - return pos.x < clipMin.x || pos.x > clipMax.x || pos.y < clipMin.y || pos.y > clipMax.y; -} - -[shader("fragment")] -float4 main(VertexOutput input) : SV_Target -{ - //return float4(1.0f, 0.0f, 0.0f, 0.3f); - CharDrawData drawData = _charDrawDatas[input.charDrawDataID]; - - float2 screenPos = input.uvAndScreenPos.zw; - float2 clipRegionMin = float2(f16tof32(drawData.packed2.x), f16tof32(drawData.packed2.x >> 16)); - float2 clipRegionMax = float2(f16tof32(drawData.packed2.y), f16tof32(drawData.packed2.y >> 16)); - if (ShouldDiscard(screenPos, clipRegionMin, clipRegionMax)) - { - //return float4(1.0f, 0.0f, 0.0f, 0.3f); - discard; - } - - uint textureIndex = drawData.packed0.x & 0xFFFF; - - uint packedTextColor = drawData.packed0.z; - uint packedBorderColor = drawData.packed0.w; - - float4 textColor = PackedUnormsToFloat4(packedTextColor); - float4 borderColor = PackedUnormsToFloat4(packedBorderColor); - - float borderSize = drawData.packed1.x; - float2 unitRange = drawData.packed1.zw; - - float4 distances = _fontTextures[textureIndex].Sample(_sampler, input.uvAndScreenPos.xy).rgba; - - const float roundedInlines = 0.0f; - const float roundedOutlines = 1.0f; - const float outBias = 1.0 / 4.0; - - float distMsdf = Median(distances.r, distances.g, distances.b); - float distSdf = distances.a; // mtsdf format only - distMsdf = min(distMsdf, distSdf + 0.1f); // HACK: to fix glitch in msdf near edges, see https://www.redblobgames.com/x/2404-distance-field-effects/ - - // Blend between sharp and rounded 
corners - float distInner = lerp(distMsdf, distSdf, roundedInlines); - float distOuter = lerp(distMsdf, distSdf, roundedOutlines); - - // Typically 0.5 is the threshold, > 0.5 is inside, < 0.5 is outside - const float threshold = 0.5f; - float width = ScreenPxRange(input.uvAndScreenPos.xy, unitRange); - - float inner = width * (distInner - threshold) + 0.5f + outBias; - float outer = width * (distOuter - threshold) + 0.5f + outBias + borderSize; - - float innerOpacity = saturate(inner); - float4 innerColor = textColor; - float outerOpacity = saturate(outer); - float4 outerColor = float4(borderColor.rgb, 1.0f); - - float4 color = (innerColor * innerOpacity) + (outerColor * (outerOpacity - innerOpacity)); - - // Apply the clipMask - float2 clipMaskRegionMin = float2(f16tof32(drawData.packed2.z), f16tof32(drawData.packed2.z >> 16)); - float2 clipMaskRegionMax = float2(f16tof32(drawData.packed2.w), f16tof32(drawData.packed2.w >> 16)); - float2 maskUV = (screenPos - clipMaskRegionMin) / (clipMaskRegionMax - clipMaskRegionMin); - - uint clipMaskTextureIndex = drawData.packed0.y; - float clipMask = _textures[clipMaskTextureIndex].Sample(_sampler, maskUV).a; - if (clipMask < 0.5f) - { - discard; - } - color.a *= clipMask; - - // Multiply the color channels by alpha for pre-multiplied alpha output - color.rgb *= color.a; - - return saturate(color); -} \ No newline at end of file diff --git a/Source/Shaders/Shaders/UI/Text.vs.slang b/Source/Shaders/Shaders/UI/Text.vs.slang deleted file mode 100644 index dcd4a6d9..00000000 --- a/Source/Shaders/Shaders/UI/Text.vs.slang +++ /dev/null @@ -1,67 +0,0 @@ - -#include "DescriptorSet/Global.inc.slang" - -[[vk::binding(0, PER_PASS)]] StructuredBuffer _vertices; -[[vk::binding(1, PER_PASS)]] StructuredBuffer _widgetWorldPositions; - -struct CharDrawData -{ - uint4 packed0; // x: textureIndex & charIndex, y: clipMaskTextureIndex, z: textColor, w: borderColor - float4 packed1; // x: borderSize, y: padding, zw: unitRangeXY - uint4 packed2; 
// x: clipRegionMinXY, y: clipRegionMaxXY, z: clipMaskRegionMinXY, w: clipMaskRegionMaxXY - int4 packed3; // x: worldPositionIndex, yzw: unused -}; -[[vk::binding(2, PER_PASS)]] StructuredBuffer _charDrawDatas; - -struct VertexInput -{ - uint vertexID : SV_VulkanVertexID; - uint charDrawDataID : SV_VulkanInstanceID; -}; - -struct VertexOutput -{ - float4 position : SV_POSITION; - float4 uvAndScreenPos : TEXCOORD0; - uint charDrawDataID : TEXCOORD1; -}; - -[shader("vertex")] -VertexOutput main(VertexInput input) -{ - CharDrawData charDrawData = _charDrawDatas[input.charDrawDataID]; - - uint charIndex = charDrawData.packed0.x >> 16; - - uint vertexID = input.vertexID + (charIndex * 6); // 6 vertices per character - float4 vertex = _vertices[vertexID]; - - float2 position = vertex.xy; - float2 uv = vertex.zw; - - int worldPositionIndex = charDrawData.packed3.x; - float4 finalPos; - - if (worldPositionIndex >= 0) - { - float3 worldPos = _widgetWorldPositions[worldPositionIndex].xyz; - - // Transform the world position to clip space. - float4 clipPos = mul(float4(worldPos, 1.0), _cameras[0].worldToClip); - clipPos.xyz /= clipPos.w; // Perform perspective division. 
- - finalPos = float4(clipPos.xy + position, 0.0, 1.0); - } - else - { - finalPos = float4(position, 0.0, 1.0); - } - - VertexOutput output; - output.position = finalPos; - float2 screenPos = (finalPos.xy + 1.0f) * 0.5f; - output.uvAndScreenPos = float4(uv, screenPos); - output.charDrawDataID = input.charDrawDataID; - - return output; -} \ No newline at end of file diff --git a/Source/Shaders/Shaders/UI/Panel.ps.slang b/Source/Shaders/Shaders/UI/Widget.ps.slang similarity index 50% rename from Source/Shaders/Shaders/UI/Panel.ps.slang rename to Source/Shaders/Shaders/UI/Widget.ps.slang index 454a2766..bc648d90 100644 --- a/Source/Shaders/Shaders/UI/Panel.ps.slang +++ b/Source/Shaders/Shaders/UI/Widget.ps.slang @@ -1,56 +1,57 @@ #include "Include/Common.inc.slang" -struct PanelDrawData +#define WIDGET_TYPE_PANEL 0u +#define WIDGET_TYPE_TEXT 1u + +struct WidgetDrawData { - uint4 packed0; // x: textureIndex & additiveTextureIndex, y: clipMaskTextureIndex, z: color, w: textureScaleToWidgetSizeXY - float4 texCoord; - float4 slicingCoord; - float4 cornerRadiusAndBorder; // xy: cornerRadius, zw: border - uint4 packed1; // x: clipRegionMinXY, y: clipRegionMaxXY, z: clipMaskRegionMinXY, w: clipMaskRegionMaxXY - int4 packed2; // x: worldPositionIndex, y: half2 anchorPos, z: half2 relativePos + uint4 packed0; // x: type, y: vertexBase, z: clipMaskTextureIndex, w: worldPositionIndex (int reinterpret) + uint4 packed1; // Panel: x: textureIndex & additiveTextureIndex, z: color, w: textureScaleToWidgetSize (half2). Text: x: fontTextureIndex, z: textColor, w: borderColor + float4 texCoord; // Panel only + float4 slicingCoord; // Panel only + float4 cornerRadiusAndBorder; // Panel: xy: cornerRadius. 
Text: x: borderSize, zw: unitRange + uint4 packed2; // x: clipRegionMinXY, y: clipRegionMaxXY, z: clipMaskRegionMinXY, w: clipMaskRegionMaxXY }; -[[vk::binding(2, PER_PASS)]] StructuredBuffer _panelDrawDatas; +[[vk::binding(2, PER_PASS)]] StructuredBuffer _widgetDrawDatas; [[vk::binding(3, PER_PASS)]] SamplerState _sampler; [[vk::binding(4, PER_PASS)]] Texture2D _textures[4096]; +[[vk::binding(5, PER_PASS)]] Texture2D _fontTextures[4096]; float NineSliceAxis(float coord, float pixelSizeUV, float texCoordMin, float texCoordMax, float borderSizeMin, float borderSizeMax) { - /* Original Code - float scaledBorderMin = texCoordMin + (borderSizeMin * pixelSizeUV); - if (coord < scaledBorderMin) // Min - return Map(coord, texCoordMin, scaledBorderMin, texCoordMin, texCoordMin + borderSizeMin); - - float scaledBorderMax = texCoordMax - (borderSizeMax * pixelSizeUV); - if (coord < scaledBorderMax) // Center - return Map(coord, scaledBorderMin, scaledBorderMax, texCoordMin + borderSizeMin, texCoordMax - borderSizeMax); - - // Max - return Map(coord, scaledBorderMax, texCoordMax, texCoordMax - borderSizeMax, texCoordMax); - */ - // Branchless Version float scaledBorderMin = texCoordMin + (borderSizeMin * pixelSizeUV); float scaledBorderMax = texCoordMax - (borderSizeMax * pixelSizeUV); - + bool isBorderMin = coord < scaledBorderMin; bool isCenter = !isBorderMin && coord < scaledBorderMax; bool isBorderMax = !isBorderMin && !isCenter; - + float originalMin = (texCoordMin * isBorderMin) + (scaledBorderMin * isCenter) + (scaledBorderMax * isBorderMax); float originalMax = (scaledBorderMin * isBorderMin) + (scaledBorderMax * isCenter) + (texCoordMax * isBorderMax); float newMin = (texCoordMin * isBorderMin) + ((texCoordMin + borderSizeMin) * isCenter) + ((texCoordMax - borderSizeMax) * isBorderMax); float newMax = ((texCoordMin + borderSizeMin) * isBorderMin) + ((texCoordMax - borderSizeMax) * isCenter) + (texCoordMax * isBorderMax); - + return Map(coord, originalMin, 
originalMax, newMin, newMax); } bool ShouldDiscard(float2 pos, float2 clipMin, float2 clipMax) { - // Check if the position is outside the clip rect return pos.x < clipMin.x || pos.x > clipMax.x || pos.y < clipMin.y || pos.y > clipMax.y; } +float Median(float a, float b, float c) +{ + return max(min(a, b), min(max(a, b), c)); +} + +float ScreenPxRange(float2 uv, float2 unitRange) +{ + float2 screenTexSize = float2(1.0f, 1.0f) / fwidth(uv); + return max(0.5 * dot(unitRange, screenTexSize), 1.0); +} + struct VertexOutput { float4 position : SV_POSITION; @@ -58,20 +59,8 @@ struct VertexOutput nointerpolation uint drawDataID : TEXCOORD1; }; -[shader("fragment")] -float4 main(VertexOutput input) : SV_Target +float4 ShadePanel(WidgetDrawData drawData, VertexOutput input) { - PanelDrawData drawData = _panelDrawDatas[input.drawDataID]; - - float2 screenPos = input.uvAndScreenPos.zw; - float2 clipRegionMin = float2(f16tof32(drawData.packed1.x), f16tof32(drawData.packed1.x >> 16)); - float2 clipRegionMax = float2(f16tof32(drawData.packed1.y), f16tof32(drawData.packed1.y >> 16)); - if (ShouldDiscard(screenPos, clipRegionMin, clipRegionMax)) - { - //return float4(1, 0, 0, 0.3f); - discard; - } - float2 uv = input.uvAndScreenPos.xy; float2 texCoordMin = drawData.texCoord.xy; float2 texCoordMax = drawData.texCoord.zw; @@ -80,8 +69,8 @@ float4 main(VertexOutput input) : SV_Target float2 borderSizeLeftTop = slicingCoordMin - texCoordMin; float2 borderSizeRightBottom = texCoordMax - slicingCoordMax; - - uint packedTextureScaleToWidgetSize = drawData.packed0.w; + + uint packedTextureScaleToWidgetSize = drawData.packed1.w; float2 scale = float2(f16tof32(packedTextureScaleToWidgetSize), f16tof32(packedTextureScaleToWidgetSize >> 16)); float2 scaledUV = float2( @@ -89,41 +78,34 @@ float4 main(VertexOutput input) : SV_Target NineSliceAxis(input.uvAndScreenPos.y, scale.y, texCoordMin.y, texCoordMax.y, borderSizeLeftTop.y, borderSizeRightBottom.y) ); - uint textureIndex = 
drawData.packed0.x & 0xFFFF; - uint additiveTextureIndex = drawData.packed0.x >> 16; - uint packedColor = drawData.packed0.z; - + uint textureIndex = drawData.packed1.x & 0xFFFF; + uint additiveTextureIndex = drawData.packed1.x >> 16; + uint packedColor = drawData.packed1.z; + float4 colorMultiplier = PackedUnormsToFloat4(packedColor); float4 color = _textures[textureIndex].Sample(_sampler, scaledUV); color *= colorMultiplier; float4 additiveColor = _textures[additiveTextureIndex].Sample(_sampler, scaledUV); - float additiveIntensity = dot(additiveColor.rgb, float3(0.299, 0.587, 0.114)) * 2.5f; // Constants from https://en.wikipedia.org/wiki/Grayscale#Luma_coding_in_video_systems + float additiveIntensity = dot(additiveColor.rgb, float3(0.299, 0.587, 0.114)) * 2.5f; additiveIntensity = saturate(additiveIntensity); - // Add the additive color to the base color color.rgb += additiveColor.rgb; - - // Blend in the intensity color.a = max(color.a, additiveIntensity); - float2 cornerRadius = drawData.cornerRadiusAndBorder.xy; // Specified in UV space + float2 cornerRadius = drawData.cornerRadiusAndBorder.xy; // Calculate distance to nearest edge float2 edgeDist = min(uv, 1.0 - uv); - // Check if cornerRadius is greater than zero if (cornerRadius.x > 0 && cornerRadius.y > 0) { - // Check if within the rounded corner area if (edgeDist.x < cornerRadius.x && edgeDist.y < cornerRadius.y) { - // Calculate distance from the corner using an elliptical formula float2 normalizedDist = 1.0 - ((edgeDist) / cornerRadius); float distToCorner = length(normalizedDist); - // Discard pixel if it's outside the rounded corner radius if (distToCorner > 1.0) { discard; @@ -131,12 +113,81 @@ float4 main(VertexOutput input) : SV_Target } } - // Apply the clipMask - float2 clipMaskRegionMin = float2(f16tof32(drawData.packed1.z), f16tof32(drawData.packed1.z >> 16)); - float2 clipMaskRegionMax = float2(f16tof32(drawData.packed1.w), f16tof32(drawData.packed1.w >> 16)); + return color; +} + +float4 
ShadeText(WidgetDrawData drawData, VertexOutput input) +{ + uint fontTextureIndex = drawData.packed1.x & 0xFFFF; + + uint packedTextColor = drawData.packed1.z; + uint packedBorderColor = drawData.packed1.w; + + float4 textColor = PackedUnormsToFloat4(packedTextColor); + float4 borderColor = PackedUnormsToFloat4(packedBorderColor); + + float borderSize = drawData.cornerRadiusAndBorder.x; + float2 unitRange = drawData.cornerRadiusAndBorder.zw; + + float4 distances = _fontTextures[fontTextureIndex].Sample(_sampler, input.uvAndScreenPos.xy).rgba; + + const float roundedInlines = 0.0f; + const float roundedOutlines = 1.0f; + const float outBias = 1.0 / 4.0; + + float distMsdf = Median(distances.r, distances.g, distances.b); + float distSdf = distances.a; // mtsdf format only + distMsdf = min(distMsdf, distSdf + 0.1f); // HACK: to fix glitch in msdf near edges, see https://www.redblobgames.com/x/2404-distance-field-effects/ + + // Blend between sharp and rounded corners + float distInner = lerp(distMsdf, distSdf, roundedInlines); + float distOuter = lerp(distMsdf, distSdf, roundedOutlines); + + const float threshold = 0.5f; + float width = ScreenPxRange(input.uvAndScreenPos.xy, unitRange); + + float inner = width * (distInner - threshold) + 0.5f + outBias; + float outer = width * (distOuter - threshold) + 0.5f + outBias + borderSize; + + float innerOpacity = saturate(inner); + float4 innerColor = textColor; + float outerOpacity = saturate(outer); + float4 outerColor = float4(borderColor.rgb, 1.0f); + + return (innerColor * innerOpacity) + (outerColor * (outerOpacity - innerOpacity)); +} + +[shader("fragment")] +float4 main(VertexOutput input) : SV_Target +{ + WidgetDrawData drawData = _widgetDrawDatas[input.drawDataID]; + + // Shared clip-region discard + float2 screenPos = input.uvAndScreenPos.zw; + float2 clipRegionMin = float2(f16tof32(drawData.packed2.x), f16tof32(drawData.packed2.x >> 16)); + float2 clipRegionMax = float2(f16tof32(drawData.packed2.y), 
f16tof32(drawData.packed2.y >> 16)); + if (ShouldDiscard(screenPos, clipRegionMin, clipRegionMax)) + { + discard; + } + + uint type = drawData.packed0.x; + float4 color; + if (type == WIDGET_TYPE_PANEL) + { + color = ShadePanel(drawData, input); + } + else // WIDGET_TYPE_TEXT + { + color = ShadeText(drawData, input); + } + + // Shared clipMask + float2 clipMaskRegionMin = float2(f16tof32(drawData.packed2.z), f16tof32(drawData.packed2.z >> 16)); + float2 clipMaskRegionMax = float2(f16tof32(drawData.packed2.w), f16tof32(drawData.packed2.w >> 16)); float2 maskUV = (screenPos - clipMaskRegionMin) / (clipMaskRegionMax - clipMaskRegionMin); - uint clipMaskTextureIndex = drawData.packed0.y; + uint clipMaskTextureIndex = drawData.packed0.z; float clipMask = _textures[clipMaskTextureIndex].Sample(_sampler, maskUV).a; if (clipMask < 0.5f) { @@ -148,4 +199,4 @@ float4 main(VertexOutput input) : SV_Target color.rgb *= color.a; return saturate(color); -} \ No newline at end of file +} diff --git a/Source/Shaders/Shaders/UI/Panel.vs.slang b/Source/Shaders/Shaders/UI/Widget.vs.slang similarity index 59% rename from Source/Shaders/Shaders/UI/Panel.vs.slang rename to Source/Shaders/Shaders/UI/Widget.vs.slang index c46a0b7f..ffb15488 100644 --- a/Source/Shaders/Shaders/UI/Panel.vs.slang +++ b/Source/Shaders/Shaders/UI/Widget.vs.slang @@ -4,16 +4,16 @@ [[vk::binding(0, PER_PASS)]] StructuredBuffer _vertices; [[vk::binding(1, PER_PASS)]] StructuredBuffer _widgetWorldPositions; -struct PanelDrawData +struct WidgetDrawData { - uint4 packed0; // x: textureIndex & additiveTextureIndex, y: clipMaskTextureIndex, z: color, w: textureScaleToWidgetSizeXY - float4 texCoord; - float4 slicingCoord; - float4 cornerRadiusAndBorder; // xy: cornerRadius, zw: border - uint4 packed1; // x: clipRegionMinXY, y: clipRegionMaxXY, z: clipMaskRegionMinXY, w: clipMaskRegionMaxXY - int4 packed2; // x: worldPositionIndex, yzw: unused + uint4 packed0; // x: type, y: vertexBase, z: clipMaskTextureIndex, w: 
worldPositionIndex (int reinterpret) + uint4 packed1; // Panel: x: textureIndex & additiveTextureIndex, z: color, w: textureScaleToWidgetSize (half2). Text: x: fontTextureIndex, z: textColor, w: borderColor + float4 texCoord; // Panel only + float4 slicingCoord; // Panel only + float4 cornerRadiusAndBorder; // Panel: xy: cornerRadius. Text: x: borderSize, zw: unitRange + uint4 packed2; // x: clipRegionMinXY, y: clipRegionMaxXY, z: clipMaskRegionMinXY, w: clipMaskRegionMaxXY }; -[[vk::binding(2, PER_PASS)]] StructuredBuffer _panelDrawDatas; +[[vk::binding(2, PER_PASS)]] StructuredBuffer _widgetDrawDatas; struct VertexInput { @@ -31,13 +31,15 @@ struct VertexOutput [shader("vertex")] VertexOutput main(VertexInput input) { - float4 vertex = _vertices[input.vertexID]; + WidgetDrawData drawData = _widgetDrawDatas[input.drawDataID]; + + uint vertexBase = drawData.packed0.y; + float4 vertex = _vertices[vertexBase + input.vertexID]; float2 position = vertex.xy; float2 uv = vertex.zw; - PanelDrawData drawData = _panelDrawDatas[input.drawDataID]; - int worldPositionIndex = drawData.packed2.x; + int worldPositionIndex = (int)drawData.packed0.w; float4 finalPos; if (worldPositionIndex >= 0) @@ -63,4 +65,4 @@ VertexOutput main(VertexInput input) output.drawDataID = input.drawDataID; return output; -} \ No newline at end of file +} From 196b112562d146c523b305663d02d6b454f58aa8 Mon Sep 17 00:00:00 2001 From: Pursche Date: Sat, 25 Apr 2026 16:05:59 +0200 Subject: [PATCH 2/2] Update Engine submodule --- Submodules/Engine | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Submodules/Engine b/Submodules/Engine index 67e495f3..644e9784 160000 --- a/Submodules/Engine +++ b/Submodules/Engine @@ -1 +1 @@ -Subproject commit 67e495f3d28e6a30826b706f82dab69c82de65b8 +Subproject commit 644e9784925f632dd9764b4cfd29c98f42aa0760