diff --git a/.gitignore b/.gitignore index 80ee0a97..df413e5c 100644 --- a/.gitignore +++ b/.gitignore @@ -39,4 +39,6 @@ CMakeSettings.json # Exceptions .cache/ -*.patch \ No newline at end of file +*.patch +.claude/ +images/ \ No newline at end of file diff --git a/Source/Game-Lib/Game-Lib/ECS/Components/UI/Widget.h b/Source/Game-Lib/Game-Lib/ECS/Components/UI/Widget.h index 053fefa8..95b52931 100644 --- a/Source/Game-Lib/Game-Lib/ECS/Components/UI/Widget.h +++ b/Source/Game-Lib/Game-Lib/ECS/Components/UI/Widget.h @@ -39,6 +39,10 @@ namespace ECS::Components::UI WidgetFlags flags = WidgetFlags::Default; u32 worldTransformIndex = std::numeric_limits().max(); + // Packed draw-order sortkey computed by CanvasRenderer. See CanvasRenderer::DfsAssignSortKey for the layout. + // Sibling-order tiebreaker lives on SceneNode2D as siblingIndex (monotonic per-parent). + u32 sortKey = 0; + Scripting::UI::Widget* scriptWidget = nullptr; // Non mutable helper functions @@ -55,4 +59,12 @@ namespace ECS::Components::UI struct DirtyWidgetClipper {}; struct DirtyWidgetWorldTransformIndex {}; struct DestroyWidget {}; + + // Marks a canvas whose widget subtree needs its sortKeys recomputed by CanvasRenderer. + struct DirtyCanvasSort {}; + + // Registry-context singleton: set when the SET of canvases (or a canvas's layer) changes, + // so CanvasRenderer knows it needs to re-rank canvasOrder before re-running DfsAssignSortKey. + // Cleared inside CanvasRenderer::Update after RebuildCanvasOrder runs. 
+ struct DirtyCanvasOrderFlag {}; } \ No newline at end of file diff --git a/Source/Game-Lib/Game-Lib/ECS/Util/Transform2D.h b/Source/Game-Lib/Game-Lib/ECS/Util/Transform2D.h index bc454e70..8e4042c4 100644 --- a/Source/Game-Lib/Game-Lib/ECS/Util/Transform2D.h +++ b/Source/Game-Lib/Game-Lib/ECS/Util/Transform2D.h @@ -287,8 +287,10 @@ namespace ECS::Components prevSibling->nextSibling = nextSibling; nextSibling->prevSibling = prevSibling; + // If we were the head of the list, the new head is the next sibling + // (which preserves insertion order: the second-inserted child becomes first). if (parent->firstChild == this) - parent->firstChild = prevSibling; + parent->firstChild = nextSibling; } nextSibling = nullptr; @@ -312,14 +314,20 @@ namespace ECS::Components } else { - //insert after the firstchild - nextSibling = newParent->firstChild->nextSibling; - prevSibling = newParent->firstChild; + // Append to the END of the circular sibling list (i.e. insert just before firstChild). + // This makes iteration order match insertion order, so siblings are drawn in the order they were created. + nextSibling = newParent->firstChild; + prevSibling = newParent->firstChild->prevSibling; prevSibling->nextSibling = this; nextSibling->prevSibling = this; } parent = newParent; + + // Assign a unique-within-current-siblings index. Using a monotonic counter on + // the parent rather than parent->children guarantees uniqueness even after + // detach+reattach cycles (where children decrements but nextSiblingIndex does not). + siblingIndex = newParent->nextSiblingIndex++; } //updates transform matrix of the children. does not recalculate matrix @@ -385,6 +393,21 @@ namespace ECS::Components SceneNode2D* nextSibling{}; SceneNode2D* prevSibling{}; i32 children{ 0 }; + + // Monotonic per-parent counter. Bumped each time a child is attached; used + // to assign a unique siblingIndex that never collides with concurrent siblings, + // even after detach/reattach cycles on the same parent. 
u32 so wraparound is + // irrelevant at any realistic UI churn rate. + u32 nextSiblingIndex{ 0 }; + // Unique index within this node's current parent. Set by SetParent. Used as + // the tiebreaker when two siblings have the same Z in the draw sort. + u32 siblingIndex{ 0 }; + + public: + u32 GetSiblingIndex() const + { + return siblingIndex; + } }; } diff --git a/Source/Game-Lib/Game-Lib/ECS/Util/UIUtil.cpp b/Source/Game-Lib/Game-Lib/ECS/Util/UIUtil.cpp index 16be20b0..d4fdc1e7 100644 --- a/Source/Game-Lib/Game-Lib/ECS/Util/UIUtil.cpp +++ b/Source/Game-Lib/Game-Lib/ECS/Util/UIUtil.cpp @@ -34,6 +34,42 @@ namespace ECS::Util { namespace UI { + entt::entity FindOwningCanvas(entt::registry* registry, entt::entity entity) + { + if (entity == entt::null) + return entt::null; + + auto* widget = registry->try_get(entity); + if (!widget) + return entt::null; + + if (widget->type == ECS::Components::UI::WidgetType::Canvas) + return entity; + + if (widget->scriptWidget) + return widget->scriptWidget->canvasEntity; + + return entt::null; + } + + void MarkCanvasSortDirty(entt::registry* registry, entt::entity canvasEntity) + { + if (canvasEntity == entt::null) + return; + registry->emplace_or_replace(canvasEntity); + } + + void MarkAllCanvasSortDirty(entt::registry* registry) + { + registry->view().each([&](entt::entity canvasEntity, auto&) + { + registry->emplace_or_replace(canvasEntity); + }); + // The canvas SET changed -> canvasOrder ranking is stale; gates the (relatively + // expensive) RebuildCanvasOrder pass next time CanvasRenderer::Update runs. 
+ registry->ctx().emplace(); + } + entt::entity GetOrEmplaceCanvas(Scripting::UI::Widget*& widget, entt::registry* registry, const char* name, vec2 pos, ivec2 size, bool isRenderTexture) { ECS::Singletons::UISingleton& uiSingleton = registry->ctx().get(); @@ -109,6 +145,11 @@ namespace ECS::Util registry->emplace(entity); } + // A new canvas entering the system shifts canvasOrder for everyone; + // mark every canvas (including this one) so all widget sortKeys get their + // canvasOrder bits refreshed on the next CanvasRenderer::Update tick. + MarkAllCanvasSortDirty(registry); + return entity; } @@ -201,6 +242,9 @@ namespace ECS::Util eventInputInfo.onFocusEndEvent = panelTemplateComp.onFocusEndEvent; eventInputInfo.onFocusHeldEvent = panelTemplateComp.onFocusHeldEvent; + // New widget entering the tree -> owning canvas needs sort-key rebuild. + MarkCanvasSortDirty(registry, FindOwningCanvas(registry, parent)); + return entity; } @@ -285,6 +329,9 @@ namespace ECS::Util eventInputInfo.onFocusEndEvent = textTemplate.onFocusEndEvent; eventInputInfo.onFocusHeldEvent = textTemplate.onFocusHeldEvent; + // New widget entering the tree -> owning canvas needs sort-key rebuild. + MarkCanvasSortDirty(registry, FindOwningCanvas(registry, parent)); + return entity; } @@ -311,6 +358,9 @@ namespace ECS::Util widgetComp.type = ECS::Components::UI::WidgetType::Widget; widgetComp.scriptWidget = widget; + // New widget entering the tree -> owning canvas needs sort-key rebuild. + MarkCanvasSortDirty(registry, FindOwningCanvas(registry, parent)); + return entity; } @@ -319,6 +369,10 @@ namespace ECS::Util if (!registry->all_of(entity)) return false; + // Widgets leaving the tree changes the sibling set in their owning canvas. + // Mark it dirty BEFORE we mutate the scriptWidget or clear the parent, so FindOwningCanvas still resolves. 
+ MarkCanvasSortDirty(registry, FindOwningCanvas(registry, entity)); + auto& transform2DSystem = Transform2DSystem::Get(*registry); transform2DSystem.ClearParent(entity); @@ -382,6 +436,10 @@ ECS::Util CallLuaEvent(eventInputInfo->onFocusBeginEvent, Scripting::UI::UIInputEvent::FocusBegin, widget.scriptWidget); } } + + // Focus affects sortKey (priority bits), so both the previously focused and the newly focused widget's canvases need their sortKeys rebuilt. + MarkCanvasSortDirty(registry, FindOwningCanvas(registry, oldFocus)); + MarkCanvasSortDirty(registry, FindOwningCanvas(registry, entity)); } entt::entity GetFocusedWidgetEntity(entt::registry* registry) diff --git a/Source/Game-Lib/Game-Lib/ECS/Util/UIUtil.h b/Source/Game-Lib/Game-Lib/ECS/Util/UIUtil.h index e20e76da..a4756ae7 100644 --- a/Source/Game-Lib/Game-Lib/ECS/Util/UIUtil.h +++ b/Source/Game-Lib/Game-Lib/ECS/Util/UIUtil.h @@ -36,6 +36,18 @@ namespace ECS::Util void FocusWidgetEntity(entt::registry* registry, entt::entity entity); entt::entity GetFocusedWidgetEntity(entt::registry* registry); + // Returns the canvas entity that owns the given widget entity (the widget itself if it IS a canvas). + // Reads the owner from scriptWidget->canvasEntity (single lookup, no hierarchy walk); returns entt::null if the entity is null, has no Widget component, or has no scriptWidget. + entt::entity FindOwningCanvas(entt::registry* registry, entt::entity entity); + + // Mark a single canvas as needing its widget sort-keys recomputed (by CanvasRenderer::Update next frame). + // Safe to call with entt::null; becomes a no-op. + void MarkCanvasSortDirty(entt::registry* registry, entt::entity canvasEntity); + + // Mark every canvas in the registry as needing sort-keys recomputed. Used when the set of canvases itself + // changes (new canvas, canvas SetLayer) so that canvasOrder bits are refreshed everywhere. 
+ void MarkAllCanvasSortDirty(entt::registry* registry); + void RefreshText(entt::registry* registry, entt::entity entity, std::string_view newText); void RefreshTemplate(entt::registry* registry, entt::entity entity, ECS::Components::UI::EventInputInfo& eventInputInfo); void RefreshClipper(entt::registry* registry, entt::entity entity); diff --git a/Source/Game-Lib/Game-Lib/Rendering/Canvas/CanvasRenderer.cpp b/Source/Game-Lib/Game-Lib/Rendering/Canvas/CanvasRenderer.cpp index c59f07c2..c5880977 100644 --- a/Source/Game-Lib/Game-Lib/Rendering/Canvas/CanvasRenderer.cpp +++ b/Source/Game-Lib/Game-Lib/Rendering/Canvas/CanvasRenderer.cpp @@ -27,13 +27,15 @@ #include #include +#include +#include + using namespace ECS::Components::UI; void CanvasRenderer::Clear() { _vertices.Clear(); - _panelDrawDatas.Clear(); - _charDrawDatas.Clear(); + _widgetDrawDatas.Clear(); _textureNameHashToIndex.clear(); _textureIDToIndex.clear(); @@ -44,8 +46,7 @@ CanvasRenderer::CanvasRenderer(Renderer::Renderer* renderer, GameRenderer* gameR : _renderer(renderer) , _gameRenderer(gameRenderer) , _debugRenderer(debugRenderer) - , _panelDescriptorSet(Renderer::DescriptorSetSlot::PER_PASS) - , _textDescriptorSet(Renderer::DescriptorSetSlot::PER_PASS) + , _widgetDescriptorSet(Renderer::DescriptorSetSlot::PER_PASS) { CreatePermanentResources(); } @@ -67,14 +68,31 @@ void CanvasRenderer::Update(f32 deltaTime) uiRegistry->view().each([&](entt::entity entity, Widget& widget) { if (widget.type == WidgetType::Canvas) + { + // RT canvases own retained GPU buffers (finalSortedArgs + finalCount). Destroy them + // and erase the bucket so we don't leak per-canvas allocations on dynamic UI churn. + // Non-RT canvases all share _mainBucket, which is process-lifetime and not freed here. 
+ if (uiRegistry->all_of(entity)) + { + auto it = _rtBuckets.find(entity); + if (it != _rtBuckets.end()) + { + if (it->second.finalSortedArgs != Renderer::BufferID::Invalid()) + _renderer->QueueDestroyBuffer(it->second.finalSortedArgs); + if (it->second.finalCount != Renderer::BufferID::Invalid()) + _renderer->QueueDestroyBuffer(it->second.finalCount); + _rtBuckets.erase(it); + } + } return; + } if (widget.type == WidgetType::Panel) { auto& panel = uiRegistry->get(entity); if (panel.gpuDataIndex != -1) - _panelDrawDatas.Remove(panel.gpuDataIndex); + _widgetDrawDatas.Remove(panel.gpuDataIndex); if (panel.gpuVertexIndex != -1) _vertices.Remove(panel.gpuVertexIndex, 6); @@ -84,7 +102,7 @@ void CanvasRenderer::Update(f32 deltaTime) auto& text = uiRegistry->get(entity); if (text.gpuDataIndex != -1) - _charDrawDatas.Remove(text.gpuDataIndex, text.numCharsNonWhitespace); + _widgetDrawDatas.Remove(text.gpuDataIndex, text.numCharsNonWhitespace); if (text.gpuVertexIndex != -1) _vertices.Remove(text.gpuVertexIndex, text.numCharsNonWhitespace * 6); // * 6 because 6 vertices per char @@ -220,24 +238,57 @@ void CanvasRenderer::Update(f32 deltaTime) if (_vertices.SyncToGPU(_renderer)) { - _panelDescriptorSet.Bind("_vertices", _vertices.GetBuffer()); - _textDescriptorSet.Bind("_vertices", _vertices.GetBuffer()); + _widgetDescriptorSet.Bind("_vertices", _vertices.GetBuffer()); } - if (_panelDrawDatas.SyncToGPU(_renderer)) + if (_widgetDrawDatas.SyncToGPU(_renderer)) { - _panelDescriptorSet.Bind("_panelDrawDatas", _panelDrawDatas.GetBuffer()); + _widgetDescriptorSet.Bind("_widgetDrawDatas", _widgetDrawDatas.GetBuffer()); } - if (_charDrawDatas.SyncToGPU(_renderer)) + if (_widgetWorldPositions.SyncToGPU(_renderer)) { - _textDescriptorSet.Bind("_charDrawDatas", _charDrawDatas.GetBuffer()); + _widgetDescriptorSet.Bind("_widgetWorldPositions", _widgetWorldPositions.GetBuffer()); } - if (_widgetWorldPositions.SyncToGPU(_renderer)) + // Rebuild sort-keys + refresh dirty buckets in one 
combined pass. + // + // DirtyCanvasSort is set by every operation that changes a canvas's draw ORDER (widget + // create/destroy, focus change, reparent). DirtyCanvasOrderFlag is a registry-context + // singleton set when the canvas SET itself changed (canvas create/destroy/SetLayer); it + // gates the (relatively expensive) RebuildCanvasOrder pass. + // + // Bucket refresh is driven by DirtyCanvasSort -- NOT DirtyCanvasTag. DirtyCanvasTag fires + // for any visual mutation (color, text content, etc.) which doesn't require a re-sort; its + // only remaining job is gating which RT canvases get re-DRAWN by AddCanvasPass. { - _panelDescriptorSet.Bind("_widgetWorldPositions", _widgetWorldPositions.GetBuffer()); - _textDescriptorSet.Bind("_widgetWorldPositions", _widgetWorldPositions.GetBuffer()); + auto dirtySortView = uiRegistry->view(); + if (dirtySortView.begin() != dirtySortView.end()) + { + if (uiRegistry->ctx().contains()) + { + RebuildCanvasOrder(uiRegistry); + uiRegistry->ctx().erase(); + } + + bool mainBucketDirty = false; + dirtySortView.each([&](entt::entity canvasEntity, Canvas&) + { + u8 canvasOrder = _canvasOrderByEntity.at(canvasEntity); + u32 traversalIndex = 0; + u8 rootPriority = ResolvePriority(uiRegistry, canvasEntity); + DfsAssignSortKey(uiRegistry, canvasEntity, canvasOrder, traversalIndex, rootPriority); + + if (uiRegistry->all_of(canvasEntity)) + RefreshBucketCPU(uiRegistry, canvasEntity, /*isRT=*/true); + else + mainBucketDirty = true; + }); + if (mainBucketDirty) + RefreshBucketCPU(uiRegistry, entt::null, /*isRT=*/false); + + uiRegistry->clear(); + } } uiRegistry->clear(); @@ -264,13 +315,22 @@ void CanvasRenderer::UpdateWorldTransform(u32 index, const vec3& position) void CanvasRenderer::AddCanvasPass(Renderer::RenderGraph* renderGraph, RenderResources& resources, u8 frameIndex) { + // --- "Canvases" (graphics) ----------------------------------------------------------------- + // Per bucket, bind its retained finalSortedArgs + finalCount 
and issue one DrawIndirectCount. + // finalSortedArgs is populated CPU-side by RefreshBucketCPU via std::sort + UploadToBuffer; + // this pass just consumes it. struct Data { Renderer::ImageMutableResource target; + // Per-bucket buffer resources, in the same order as _drawBuckets below. Each element i + // corresponds to a {RT canvas or main} DrawIndirectCount call. + std::vector argBuffers; + std::vector countBuffers; + std::vector bucketCanvasEntities; // entt::null for the main bucket + Renderer::DescriptorSetResource globalDescriptorSet; - Renderer::DescriptorSetResource panelDescriptorSet; - Renderer::DescriptorSetResource textDescriptorSet; + Renderer::DescriptorSetResource widgetDescriptorSet; }; renderGraph->AddPass("Canvases", [this, &resources](Data& data, Renderer::RenderGraphBuilder& builder) // Setup @@ -283,13 +343,35 @@ void CanvasRenderer::AddCanvasPass(Renderer::RenderGraph* renderGraph, RenderRes builder.Read(_vertices.GetBuffer(), BufferUsage::GRAPHICS); - builder.Read(_panelDrawDatas.GetBuffer(), BufferUsage::GRAPHICS); - builder.Read(_charDrawDatas.GetBuffer(), BufferUsage::GRAPHICS); + builder.Read(_widgetDrawDatas.GetBuffer(), BufferUsage::GRAPHICS); builder.Read(_widgetWorldPositions.GetBuffer(), BufferUsage::GRAPHICS); + // Register each drawable bucket's retained final buffers. + entt::registry* registry = ServiceLocator::GetEnttRegistries()->uiRegistry; + + // RT canvases: only dirty ones draw this frame. + registry->view().each( + [&](entt::entity canvasEntity, Canvas&) + { + auto it = _rtBuckets.find(canvasEntity); + if (it == _rtBuckets.end() || it->second.drawCount == 0) + return; + BucketResources& b = it->second; + data.argBuffers.push_back(builder.Read(b.finalSortedArgs, BufferUsage::GRAPHICS)); + data.countBuffers.push_back(builder.Read(b.finalCount, BufferUsage::GRAPHICS)); + data.bucketCanvasEntities.push_back(canvasEntity); + }); + + // Main bucket: always drawn if non-empty. 
+ if (_mainBucket.drawCount > 0) + { + data.argBuffers.push_back(builder.Read(_mainBucket.finalSortedArgs, BufferUsage::GRAPHICS)); + data.countBuffers.push_back(builder.Read(_mainBucket.finalCount, BufferUsage::GRAPHICS)); + data.bucketCanvasEntities.push_back(entt::null); + } + data.globalDescriptorSet = builder.Use(resources.globalDescriptorSet); - data.panelDescriptorSet = builder.Use(_panelDescriptorSet); - data.textDescriptorSet = builder.Use(_textDescriptorSet); + data.widgetDescriptorSet = builder.Use(_widgetDescriptorSet); return true;// Return true from setup to enable this pass, return false to disable it }, @@ -297,152 +379,82 @@ void CanvasRenderer::AddCanvasPass(Renderer::RenderGraph* renderGraph, RenderRes { GPU_SCOPED_PROFILER_ZONE(commandList, DebugRender2D); entt::registry* registry = ServiceLocator::GetEnttRegistries()->uiRegistry; - auto& transform2DSystem = ECS::Transform2DSystem::Get(*registry); - - Renderer::GraphicsPipelineID currentPipeline; - _lastRenderedWidgetType = WidgetType::None; - - // Loop over dirty rendertarget canvases - registry->view().each([&](auto entity, auto& canvas) - { - Renderer::TextureBaseDesc textureDesc = _renderer->GetDesc(canvas.renderTexture); - commandList.SetViewport(0, 0, static_cast(textureDesc.width), static_cast(textureDesc.height), 0.0f, 1.0f); - commandList.SetScissorRect(0, static_cast(textureDesc.width), 0, static_cast(textureDesc.height)); - - Renderer::TextureRenderPassDesc renderPassDesc; - renderPassDesc.renderTargets[0] = canvas.renderTexture; - renderPassDesc.clearRenderTargets[0] = true; - bool hasDrawn = false; - - // Loop over children recursively (depth first) - transform2DSystem.IterateChildrenRecursiveDepth(entity, [&, registry](auto childEntity) - { - auto& transform = registry->get(childEntity); - auto& childWidget = registry->get(childEntity); - - if (!childWidget.IsVisible()) - return false; // Skip invisible widgets - - if (childWidget.type == WidgetType::Canvas) - return true; // There 
is nothing to draw for a canvas - - if (!hasDrawn) - { - commandList.PushMarker("RT Canvas: " + canvas.name, Color::PastelOrange); - commandList.BeginRenderPass(renderPassDesc); - hasDrawn = true; - } - - if (ChangePipelineIfNecessary(commandList, currentPipeline, childWidget.type)) - { - if (childWidget.type == WidgetType::Panel) - { - commandList.BindDescriptorSet(data.panelDescriptorSet, frameIndex); - } - else if (childWidget.type == WidgetType::Text) - { - commandList.BindDescriptorSet(data.textDescriptorSet, frameIndex); - } - } - - if (childWidget.type == WidgetType::Panel) - { - auto& panel = registry->get(childEntity); - RenderPanel(commandList, transform, childWidget, panel); - } - else if (childWidget.type == WidgetType::Text) - { - auto& text = registry->get(childEntity); - if (text.numCharsNonWhitespace > 0) - { - RenderText(commandList, transform, childWidget, text); - } - } - - return true; - }); - - if (hasDrawn) - { - commandList.EndPipeline(currentPipeline); - commandList.EndRenderPass(renderPassDesc); - commandList.PopMarker(); - } - }); - - _lastRenderedWidgetType = WidgetType::None; vec2 renderSize = _renderer->GetRenderSize(); - commandList.SetViewport(0, 0, renderSize.x, renderSize.y, 0.0f, 1.0f); - commandList.SetScissorRect(0, static_cast(renderSize.x), 0, static_cast(renderSize.y)); - // Loop over regular canvases + // Single instance, used for the main bucket's BeginRenderPass during the loop AND for + // EndRenderPass after the loop. Avoids the previous "init this struct three separate + // times" dance. 
Renderer::RenderPassDesc mainRenderPassDesc; graphResources.InitializeRenderPassDesc(mainRenderPassDesc); mainRenderPassDesc.renderTargets[0] = data.target; - commandList.BeginRenderPass(mainRenderPassDesc); - registry->view(entt::exclude).each([&](auto entity, auto& canvas) + bool mainRenderPassOpen = false; + + for (size_t i = 0; i < data.bucketCanvasEntities.size(); ++i) { - bool hasDrawn = false; + entt::entity canvasEntity = data.bucketCanvasEntities[i]; + const bool isMain = (canvasEntity == entt::null); - // Loop over children recursively (depth first) - transform2DSystem.IterateChildrenRecursiveDepth(entity, [&, registry](auto childEntity) + u32 drawCount = 0; + if (isMain) { - auto& transform = registry->get(childEntity); - auto& childWidget = registry->get(childEntity); - - if (!childWidget.IsVisible()) - return false; // Skip invisible widgets - - if (childWidget.type == WidgetType::Canvas) - return true; // There is nothing to draw for a canvas - - if (!hasDrawn) - { - commandList.PushMarker("Canvas: " + canvas.name, Color::PastelOrange); - hasDrawn = true; - } - - if (ChangePipelineIfNecessary(commandList, currentPipeline, childWidget.type)) - { - commandList.BindDescriptorSet(data.globalDescriptorSet, frameIndex); - if (childWidget.type == WidgetType::Panel) - { - commandList.BindDescriptorSet(data.panelDescriptorSet, frameIndex); - } - else if (childWidget.type == WidgetType::Text) - { - commandList.BindDescriptorSet(data.textDescriptorSet, frameIndex); - } - } - - if (childWidget.type == WidgetType::Panel) - { - auto& panel = registry->get(childEntity); - RenderPanel(commandList, transform, childWidget, panel); - } - else if (childWidget.type == WidgetType::Text) - { - auto& text = registry->get(childEntity); - if (text.numCharsNonWhitespace > 0) - { - RenderText(commandList, transform, childWidget, text); - } - } - - return true; - }); + drawCount = _mainBucket.drawCount; + } + else + { + auto it = _rtBuckets.find(canvasEntity); + drawCount = (it 
== _rtBuckets.end()) ? 0 : it->second.drawCount; + } + if (drawCount == 0) + continue; - if (hasDrawn) + if (!isMain) { + auto& canvas = registry->get(canvasEntity); + Renderer::TextureBaseDesc textureDesc = _renderer->GetDesc(canvas.renderTexture); + commandList.SetViewport(0, 0, static_cast(textureDesc.width), static_cast(textureDesc.height), 0.0f, 1.0f); + commandList.SetScissorRect(0, static_cast(textureDesc.width), 0, static_cast(textureDesc.height)); + + Renderer::TextureRenderPassDesc renderPassDesc; + renderPassDesc.renderTargets[0] = canvas.renderTexture; + renderPassDesc.clearRenderTargets[0] = true; + + commandList.PushMarker("RT Canvas: " + canvas.name, Color::PastelOrange); + commandList.BeginRenderPass(renderPassDesc); + commandList.BeginPipeline(_widgetPipeline); + commandList.BindDescriptorSet(data.globalDescriptorSet, frameIndex); + commandList.BindDescriptorSet(data.widgetDescriptorSet, frameIndex); + commandList.DrawIndirectCount(data.argBuffers[i], 0, data.countBuffers[i], 0, drawCount); + commandList.EndPipeline(_widgetPipeline); + commandList.EndRenderPass(renderPassDesc); commandList.PopMarker(); } - }); + else + { + if (!mainRenderPassOpen) + { + commandList.SetViewport(0, 0, renderSize.x, renderSize.y, 0.0f, 1.0f); + commandList.SetScissorRect(0, static_cast(renderSize.x), 0, static_cast(renderSize.y)); + commandList.BeginRenderPass(mainRenderPassDesc); + mainRenderPassOpen = true; + } + commandList.BeginPipeline(_widgetPipeline); + commandList.BindDescriptorSet(data.globalDescriptorSet, frameIndex); + commandList.BindDescriptorSet(data.widgetDescriptorSet, frameIndex); + commandList.DrawIndirectCount(data.argBuffers[i], 0, data.countBuffers[i], 0, drawCount); + commandList.EndPipeline(_widgetPipeline); + } + } - if (_lastRenderedWidgetType != WidgetType::None) + // Always end the frame with the main render pass closed. 
If nothing got drawn into + // main (zero non-RT canvas draws), still open+close so downstream passes see a clean + // sceneColor attachment state. + if (!mainRenderPassOpen) { - commandList.EndPipeline(currentPipeline); + commandList.SetViewport(0, 0, renderSize.x, renderSize.y, 0.0f, 1.0f); + commandList.SetScissorRect(0, static_cast(renderSize.x), 0, static_cast(renderSize.y)); + commandList.BeginRenderPass(mainRenderPassDesc); } commandList.EndRenderPass(mainRenderPassDesc); @@ -459,8 +471,7 @@ void CanvasRenderer::CreatePermanentResources() textureArrayDesc.size = 4096; _textures = _renderer->CreateTextureArray(textureArrayDesc); - _panelDescriptorSet.Bind("_textures", _textures); - _textDescriptorSet.Bind("_textures", _textures); + _widgetDescriptorSet.Bind("_textures", _textures); Renderer::DataTextureDesc dataTextureDesc; dataTextureDesc.width = 1; @@ -491,8 +502,7 @@ void CanvasRenderer::CreatePermanentResources() samplerDesc.shaderVisibility = Renderer::ShaderVisibility::PIXEL; _sampler = _renderer->CreateSampler(samplerDesc); - _panelDescriptorSet.Bind("_sampler"_h, _sampler); - _textDescriptorSet.Bind("_sampler"_h, _sampler); + _widgetDescriptorSet.Bind("_sampler"_h, _sampler); textureArrayDesc.size = 256; _fontTextures = _renderer->CreateTextureArray(textureArrayDesc); @@ -500,16 +510,13 @@ void CanvasRenderer::CreatePermanentResources() _font = Renderer::Font::GetDefaultFont(_renderer); _renderer->AddTextureToArray(_font->GetTextureID(), _fontTextures); - _textDescriptorSet.Bind("_fontTextures"_h, _fontTextures); + _widgetDescriptorSet.Bind("_fontTextures"_h, _fontTextures); _vertices.SetDebugName("UIVertices"); _vertices.SetUsage(Renderer::BufferUsage::STORAGE_BUFFER); - _panelDrawDatas.SetDebugName("PanelDrawDatas"); - _panelDrawDatas.SetUsage(Renderer::BufferUsage::STORAGE_BUFFER); - - _charDrawDatas.SetDebugName("CharDrawDatas"); - _charDrawDatas.SetUsage(Renderer::BufferUsage::STORAGE_BUFFER); + _widgetDrawDatas.SetDebugName("WidgetDrawDatas"); + 
_widgetDrawDatas.SetUsage(Renderer::BufferUsage::STORAGE_BUFFER); _widgetWorldPositions.SetDebugName("WidgetWorldPositions"); _widgetWorldPositions.SetUsage(Renderer::BufferUsage::STORAGE_BUFFER); @@ -520,73 +527,40 @@ void CanvasRenderer::CreatePermanentResources() void CanvasRenderer::CreatePipelines() { - // Create pipelines + // Create the merged Widget pipeline Renderer::ImageFormat renderTargetFormat = _renderer->GetSwapChainImageFormat(); - { - Renderer::GraphicsPipelineDesc pipelineDesc; - - // Rasterizer state - pipelineDesc.states.rasterizerState.cullMode = Renderer::CullMode::BACK; + Renderer::GraphicsPipelineDesc pipelineDesc; - // Render targets. - pipelineDesc.states.renderTargetFormats[0] = renderTargetFormat; + // Rasterizer state + pipelineDesc.states.rasterizerState.cullMode = Renderer::CullMode::BACK; - // Shader - Renderer::VertexShaderDesc vertexShaderDesc; - vertexShaderDesc.shaderEntry = _gameRenderer->GetShaderEntry("UI/Panel.vs"_h, "UI/Panel.vs"); - pipelineDesc.states.vertexShader = _renderer->LoadShader(vertexShaderDesc); + // Render targets. 
+ pipelineDesc.states.renderTargetFormats[0] = renderTargetFormat; - Renderer::PixelShaderDesc pixelShaderDesc; - pixelShaderDesc.shaderEntry = _gameRenderer->GetShaderEntry("UI/Panel.ps"_h, "UI/Panel.ps"); - pipelineDesc.states.pixelShader = _renderer->LoadShader(pixelShaderDesc); + // Shader + Renderer::VertexShaderDesc vertexShaderDesc; + vertexShaderDesc.shaderEntry = _gameRenderer->GetShaderEntry("UI/Widget.vs"_h, "UI/Widget.vs"); + pipelineDesc.states.vertexShader = _renderer->LoadShader(vertexShaderDesc); - // Blending - pipelineDesc.states.blendState.renderTargets[0].blendEnable = true; - pipelineDesc.states.blendState.renderTargets[0].srcBlend = Renderer::BlendMode::SRC_ALPHA; - pipelineDesc.states.blendState.renderTargets[0].destBlend = Renderer::BlendMode::INV_SRC_ALPHA; - pipelineDesc.states.blendState.renderTargets[0].srcBlendAlpha = Renderer::BlendMode::ONE; - pipelineDesc.states.blendState.renderTargets[0].destBlendAlpha = Renderer::BlendMode::INV_SRC_ALPHA; + Renderer::PixelShaderDesc pixelShaderDesc; + pixelShaderDesc.shaderEntry = _gameRenderer->GetShaderEntry("UI/Widget.ps"_h, "UI/Widget.ps"); + pipelineDesc.states.pixelShader = _renderer->LoadShader(pixelShaderDesc); - _panelPipeline = _renderer->CreatePipeline(pipelineDesc); - } - - { - Renderer::GraphicsPipelineDesc pipelineDesc; - - // Rasterizer state - pipelineDesc.states.rasterizerState.cullMode = Renderer::CullMode::BACK; - - // Render targets. 
- pipelineDesc.states.renderTargetFormats[0] = renderTargetFormat; - - // Shader - Renderer::VertexShaderDesc vertexShaderDesc; - vertexShaderDesc.shaderEntry = _gameRenderer->GetShaderEntry("UI/Text.vs"_h, "UI/Text.vs"); - pipelineDesc.states.vertexShader = _renderer->LoadShader(vertexShaderDesc); + // Blending + pipelineDesc.states.blendState.renderTargets[0].blendEnable = true; + pipelineDesc.states.blendState.renderTargets[0].srcBlend = Renderer::BlendMode::SRC_ALPHA; + pipelineDesc.states.blendState.renderTargets[0].destBlend = Renderer::BlendMode::INV_SRC_ALPHA; + pipelineDesc.states.blendState.renderTargets[0].srcBlendAlpha = Renderer::BlendMode::ONE; + pipelineDesc.states.blendState.renderTargets[0].destBlendAlpha = Renderer::BlendMode::INV_SRC_ALPHA; - Renderer::PixelShaderDesc pixelShaderDesc; - pixelShaderDesc.shaderEntry = _gameRenderer->GetShaderEntry("UI/Text.ps"_h, "UI/Text.ps"); - pipelineDesc.states.pixelShader = _renderer->LoadShader(pixelShaderDesc); - - // Blending - pipelineDesc.states.blendState.renderTargets[0].blendEnable = true; - pipelineDesc.states.blendState.renderTargets[0].srcBlend = Renderer::BlendMode::SRC_ALPHA; - pipelineDesc.states.blendState.renderTargets[0].destBlend = Renderer::BlendMode::INV_SRC_ALPHA; - pipelineDesc.states.blendState.renderTargets[0].srcBlendAlpha = Renderer::BlendMode::ONE; - pipelineDesc.states.blendState.renderTargets[0].destBlendAlpha = Renderer::BlendMode::INV_SRC_ALPHA; - - _textPipeline = _renderer->CreatePipeline(pipelineDesc); - } + _widgetPipeline = _renderer->CreatePipeline(pipelineDesc); } void CanvasRenderer::InitDescriptorSets() { - _panelDescriptorSet.RegisterPipeline(_renderer, _panelPipeline); - _panelDescriptorSet.Init(_renderer); - _textDescriptorSet.RegisterPipeline(_renderer, _textPipeline); - _textDescriptorSet.Init(_renderer); - + _widgetDescriptorSet.RegisterPipeline(_renderer, _widgetPipeline); + _widgetDescriptorSet.Init(_renderer); } void CanvasRenderer::UpdatePanelVertices(const 
vec2& clipPos, const vec2& clipSize, ECS::Components::UI::Panel& panel, ECS::Components::UI::PanelTemplate& panelTemplate) @@ -789,14 +763,16 @@ void CanvasRenderer::UpdatePanelData(entt::entity entity, ECS::Components::Trans // Add draw data if necessary if (panel.gpuDataIndex == -1) { - panel.gpuDataIndex = _panelDrawDatas.Add(); + panel.gpuDataIndex = _widgetDrawDatas.Add(); } vec2 size = transform.GetSize(); - vec2 cornerRadius = vec2(panelTemplate.cornerRadius / size.x, panelTemplate.cornerRadius /size.y); + vec2 cornerRadius = vec2(panelTemplate.cornerRadius / size.x, panelTemplate.cornerRadius / size.y); // Update draw data - auto& drawData = _panelDrawDatas[panel.gpuDataIndex]; - drawData.packed0.z = panelTemplate.color.ToRGBA32(); + auto& drawData = _widgetDrawDatas[panel.gpuDataIndex]; + drawData.packed0.x = static_cast(WidgetDrawType::Panel); + drawData.packed0.y = static_cast(panel.gpuVertexIndex); // vertexBase + drawData.packed1.z = panelTemplate.color.ToRGBA32(); drawData.cornerRadiusAndBorder = vec4(cornerRadius, 0.0f, 0.0f); // Update textures @@ -823,7 +799,7 @@ void CanvasRenderer::UpdatePanelData(entt::entity entity, ECS::Components::Trans additiveTextureIndex = LoadTexture(panelTemplate.foreground); } - drawData.packed0.x = (textureIndex & 0xFFFF) | ((additiveTextureIndex & 0xFFFF) << 16); + drawData.packed1.x = (textureIndex & 0xFFFF) | ((additiveTextureIndex & 0xFFFF) << 16); // Nine slicing const vec2& widgetSize = transform.GetSize(); @@ -833,7 +809,9 @@ void CanvasRenderer::UpdatePanelData(entt::entity entity, ECS::Components::Trans vec2 texSize = vec2(textureBaseDesc.width, textureBaseDesc.height); vec2 textureScaleToWidgetSize = texSize / widgetSize; - drawData.textureScaleToWidgetSize = hvec2(textureScaleToWidgetSize.x, textureScaleToWidgetSize.y); + hvec2 packedScale = hvec2(textureScaleToWidgetSize.x, textureScaleToWidgetSize.y); + static_assert(sizeof(hvec2) == sizeof(u32), "hvec2 must be 4 bytes for packed storage"); + 
std::memcpy(&drawData.packed1.w, &packedScale, sizeof(u32)); drawData.texCoord = vec4(panelTemplate.texCoords.min, panelTemplate.texCoords.max); drawData.slicingCoord = vec4(panelTemplate.nineSliceCoords.min, panelTemplate.nineSliceCoords.max); @@ -843,7 +821,7 @@ void CanvasRenderer::UpdatePanelData(entt::entity entity, ECS::Components::Trans // Get the correct clipper auto* clipper = ®istry->get(entity); BoundingRect* boundingRect = ®istry->get(entity); - + vec2 referenceSize = vec2(Renderer::Settings::UI_REFERENCE_WIDTH, Renderer::Settings::UI_REFERENCE_HEIGHT); vec2 clipRegionMin = clipper->clipRegionMin; vec2 clipRegionMax = clipper->clipRegionMax; @@ -859,16 +837,16 @@ void CanvasRenderer::UpdatePanelData(entt::entity entity, ECS::Components::Trans vec2 scaledClipMaskRegionMin = boundingRect->min / referenceSize; vec2 scaledClipMaskRegionMax = boundingRect->max / referenceSize; - - drawData.packed0.y = (clipper->hasClipMaskTexture) ? LoadTexture(clipper->clipMaskTexture) : 0; - drawData.clipRegionRect = vec4(clipRegionMin, clipRegionMax); - drawData.clipMaskRegionRect = vec4(scaledClipMaskRegionMin, scaledClipMaskRegionMax); - // World position UI + drawData.packed0.z = (clipper->hasClipMaskTexture) ? 
LoadTexture(clipper->clipMaskTexture) : 0; + drawData.clipRegionRect = hvec4(clipRegionMin.x, clipRegionMin.y, clipRegionMax.x, clipRegionMax.y); + drawData.clipMaskRegionRect = hvec4(scaledClipMaskRegionMin.x, scaledClipMaskRegionMin.y, scaledClipMaskRegionMax.x, scaledClipMaskRegionMax.y); + + // World position UI (UINT_MAX bit-pattern == -1 when reinterpreted as int in the shader) auto& widget = registry->get(entity); - drawData.worldPositionIndex = widget.worldTransformIndex; + drawData.packed0.w = widget.worldTransformIndex; - _panelDrawDatas.SetDirtyElement(panel.gpuDataIndex); + _widgetDrawDatas.SetDirtyElement(panel.gpuDataIndex); } void CanvasRenderer::UpdateTextData(entt::entity entity, Text& text, ECS::Components::UI::TextTemplate& textTemplate) @@ -903,10 +881,10 @@ void CanvasRenderer::UpdateTextData(entt::entity entity, Text& text, ECS::Compon // Add or update draw data if necessary if (text.gpuDataIndex == -1 || text.hasGrown) { - text.gpuDataIndex = _charDrawDatas.AddCount(text.numCharsNonWhitespace); + text.gpuDataIndex = _widgetDrawDatas.AddCount(text.numCharsNonWhitespace); } - // Update CharDrawData + // Update WidgetDrawData entries (one per non-whitespace char) Renderer::Font* font = Renderer::Font::GetFont(_renderer, textTemplate.font); Renderer::TextureID fontTextureID = font->GetTextureID(); @@ -964,70 +942,32 @@ void CanvasRenderer::UpdateTextData(entt::entity entity, Text& text, ECS::Compon continue; } - auto& drawData = _charDrawDatas[text.gpuDataIndex + charIndex]; - drawData.packed0.x = (fontTextureIndex & 0xFFFF) | ((charIndex & 0xFFFF) << 16); - drawData.packed0.z = textTemplate.color.ToRGBA32(); - drawData.packed0.w = textTemplate.borderColor.ToRGBA32(); - - drawData.packed1.x = textTemplate.borderSize; + auto& drawData = _widgetDrawDatas[text.gpuDataIndex + charIndex]; + drawData.packed0.x = static_cast(WidgetDrawType::Text); + drawData.packed0.y = static_cast(text.gpuVertexIndex) + (charIndex * 6); // vertexBase + 
drawData.packed1.x = (fontTextureIndex & 0xFFFF); + drawData.packed1.z = textTemplate.color.ToRGBA32(); + drawData.packed1.w = textTemplate.borderColor.ToRGBA32(); - // Unit range + // borderSize in cornerRadiusAndBorder.x; unitRange in .zw f32 distanceRange = font->upperPixelRange - font->lowerPixelRange; - drawData.packed1.z = distanceRange / font->width; - drawData.packed1.w = distanceRange / font->height; + drawData.cornerRadiusAndBorder.x = textTemplate.borderSize; + drawData.cornerRadiusAndBorder.y = 0.0f; + drawData.cornerRadiusAndBorder.z = distanceRange / font->width; + drawData.cornerRadiusAndBorder.w = distanceRange / font->height; // Clipping - drawData.packed0.y = (clipper->hasClipMaskTexture) ? LoadTexture(clipper->clipMaskTexture) : 0; - drawData.clipRegionRect = vec4(clipRegionMin, clipRegionMax); - drawData.clipMaskRegionRect = vec4(scaledClipMaskRegionMin, scaledClipMaskRegionMax); + drawData.packed0.z = (clipper->hasClipMaskTexture) ? LoadTexture(clipper->clipMaskTexture) : 0; + drawData.clipRegionRect = hvec4(clipRegionMin.x, clipRegionMin.y, clipRegionMax.x, clipRegionMax.y); + drawData.clipMaskRegionRect = hvec4(scaledClipMaskRegionMin.x, scaledClipMaskRegionMin.y, scaledClipMaskRegionMax.x, scaledClipMaskRegionMax.y); - // World position UI + // World position UI (UINT_MAX bit-pattern == -1 when reinterpreted as int in the shader) auto& widget = registry->get(entity); - drawData.worldPositionIndex = widget.worldTransformIndex; + drawData.packed0.w = widget.worldTransformIndex; charIndex++; } - _charDrawDatas.SetDirtyElements(text.gpuDataIndex, text.numCharsNonWhitespace); -} - -bool CanvasRenderer::ChangePipelineIfNecessary(Renderer::CommandList& commandList, Renderer::GraphicsPipelineID& currentPipeline, ECS::Components::UI::WidgetType widgetType) -{ - if (_lastRenderedWidgetType != widgetType) - { - if (_lastRenderedWidgetType != WidgetType::None) - { - commandList.EndPipeline(currentPipeline); - } - - _lastRenderedWidgetType = widgetType; 
- - if (widgetType == WidgetType::Panel) - { - currentPipeline = _panelPipeline; - } - else - { - currentPipeline = _textPipeline; - } - - commandList.BeginPipeline(currentPipeline); - return true; - } - return false; -} - -void CanvasRenderer::RenderPanel(Renderer::CommandList& commandList, ECS::Components::Transform2D& transform, Widget& widget, Panel& panel) -{ - commandList.PushMarker("Panel", Color::White); - commandList.Draw(6, 1, panel.gpuVertexIndex, panel.gpuDataIndex); - commandList.PopMarker(); -} - -void CanvasRenderer::RenderText(Renderer::CommandList& commandList, ECS::Components::Transform2D& transform, Widget& widget, Text& text) -{ - commandList.PushMarker("Text", Color::White); - commandList.Draw(6, text.numCharsNonWhitespace, text.gpuVertexIndex, text.gpuDataIndex); - commandList.PopMarker(); + _widgetDrawDatas.SetDirtyElements(text.gpuDataIndex, text.numCharsNonWhitespace); } vec2 CanvasRenderer::PixelPosToNDC(const vec2& pixelPosition, const vec2& screenSize) const @@ -1065,7 +1005,7 @@ u32 CanvasRenderer::LoadTexture(std::string_view path) // Use already loaded texture return _textureNameHashToIndex[textureNameHash]; } - + // Load texture Renderer::TextureDesc desc; desc.path = path; @@ -1076,3 +1016,228 @@ u32 CanvasRenderer::LoadTexture(std::string_view path) _textureNameHashToIndex[textureNameHash] = textureIndex; return textureIndex; } + +// ---------------------------------------------------------------------------- +// Sortkey layout (u32): +// MSB LSB +// [ priority 5 | canvasOrder 8 | traversalIndex 15 | reserved 4 ] +// +// - priority: 0 = normal, >0 = promoted (focus/drag/modal...). At the top so a dragged/focused +// widget floats above every normal widget, across canvases. +// - canvasOrder: 0 = bottom canvas, grows upward. Makes per-canvas sort a natural consequence +// of the sort key - no explicit grouping loop needed in the render pass. +// - traversalIndex: DFS pre-order index within a canvas, Z-sorted at each sibling level. 
+// Unique within the canvas. Encodes parent-before-child containment. Caps at 2^15-1 = 32,767 +// widgets per canvas; runaway counts are clamped so they never leak into canvasOrder bits. +// - reserved: for future use (clip bucket, atlas bucket, ...). +// ---------------------------------------------------------------------------- +u8 CanvasRenderer::ResolvePriority(entt::registry* registry, entt::entity entity) const +{ + auto& uiSingleton = registry->ctx().get(); + if (entity != entt::null && entity == uiSingleton.focusedEntity) + { + return 1; // Focus tier. Drag/modal/tooltip slots reserved for future systems. + } + return 0; +} + +void CanvasRenderer::RebuildCanvasOrder(entt::registry* registry) +{ + _canvasOrderByEntity.clear(); + + // Collect canvases + their layer. We iterate via registry->view so the natural + // ordering from entt is used as the tiebreaker when two canvases share a layer. + struct CanvasOrderEntry { entt::entity entity; u32 layer; u32 iterSeenIndex; }; + std::vector canvases; + + u32 iterSeenIndex = 0; + registry->view().each([&](entt::entity canvasEntity, Canvas&) + { + auto& transform = registry->get(canvasEntity); + canvases.push_back({ canvasEntity, transform.GetLayer(), iterSeenIndex++ }); + }); + + // Sort: layer asc, then iteration order asc. Unique keys by construction. 
+ std::sort(canvases.begin(), canvases.end(), [](const CanvasOrderEntry& a, const CanvasOrderEntry& b) + { + if (a.layer != b.layer) + return a.layer < b.layer; + return a.iterSeenIndex < b.iterSeenIndex; + }); + + for (size_t i = 0; i < canvases.size(); ++i) + { + _canvasOrderByEntity[canvases[i].entity] = static_cast(std::min(i, 255)); + } +} + +void CanvasRenderer::DfsAssignSortKey(entt::registry* registry, entt::entity entity, u8 canvasOrder, u32& traversalIndex, u8 inheritedPriority) +{ + auto& transform2DSystem = ECS::Transform2DSystem::Get(*registry); + + auto& widget = registry->get(entity); + u8 effectivePriority = std::max(inheritedPriority, ResolvePriority(registry, entity)); + + // Canvases aren't drawn themselves, so we don't produce a sort key for them (they're iteration hubs). + if (widget.type != WidgetType::Canvas) + { + // u32 sortkey layout (MSB -> LSB): + // [ priority 5 | canvasOrder 8 | traversalIndex 15 | reserved 4 ] + // 32,768 widgets per canvas max; runaway scripts get clamped so the key never bleeds + // into the canvasOrder bits. In practice real UIs are well under 1000 widgets per canvas. + constexpr u32 kMaxTraversalIndex = (1u << 15) - 1; + const u32 clampedTraversal = std::min(traversalIndex, kMaxTraversalIndex); + widget.sortKey = (static_cast(effectivePriority) << 27) + | (static_cast(canvasOrder) << 19) + | (clampedTraversal << 4); + ++traversalIndex; + } + + // Gather children, sort by (Transform2D::layer asc, SceneNode2D::siblingIndex asc). Not stable_sort - + // the siblingIndex tiebreaker already guarantees a total order. 
+ // + // Recursion-safe via stack discipline on the shared _siblingScratch: + // - record `start` before pushing this level's children, + // - sort only [start, end), + // - copy each child by VALUE before recursing (the recursive call will push more entries + // and may reallocate the underlying buffer; the by-value copy is unaffected), + // - resize back to `start` before returning so the caller's frame is intact. + const size_t start = _siblingScratch.size(); + transform2DSystem.IterateChildren(entity, [&](entt::entity childEntity) + { + _siblingScratch.push_back(childEntity); + }); + const size_t count = _siblingScratch.size() - start; + + std::sort(_siblingScratch.begin() + start, _siblingScratch.end(), [&](entt::entity a, entt::entity b) + { + const auto& ta = registry->get(a); + const auto& tb = registry->get(b); + if (ta.GetLayer() != tb.GetLayer()) + return ta.GetLayer() < tb.GetLayer(); + + const auto& na = registry->get(a); + const auto& nb = registry->get(b); + return na.GetSiblingIndex() < nb.GetSiblingIndex(); + }); + + for (size_t i = 0; i < count; ++i) + { + // By-value copy is mandatory: the recursive call will push to _siblingScratch and may + // reallocate the backing buffer, invalidating any reference into it. + const entt::entity child = _siblingScratch[start + i]; + DfsAssignSortKey(registry, child, canvasOrder, traversalIndex, effectivePriority); + } + + _siblingScratch.resize(start); +} + +void CanvasRenderer::RefreshBucketCPU(entt::registry* registry, entt::entity canvasEntity, bool isRT) +{ + ECS::Transform2DSystem& transformSystem2D = ECS::Transform2DSystem::Get(*registry); + + // Resolve target bucket (insert empty on first encounter for this RT canvas). + BucketResources* bucket = isRT ? 
&_rtBuckets.try_emplace(canvasEntity).first->second + : &_mainBucket; + + // --- Gather (sortKey, IndirectDraw) pairs into _sortScratch ------------------------------- + _sortScratch.clear(); + + auto gather = [&](entt::entity root) + { + transformSystem2D.IterateChildrenRecursiveDepth(root, [&](entt::entity childEntity) + { + auto& w = registry->get(childEntity); + if (!w.IsVisible()) + return false; + if (w.type != WidgetType::Panel && w.type != WidgetType::Text) + return true; + + Renderer::IndirectDraw args{}; + args.vertexCount = 6; + args.firstVertex = 0; + + if (w.type == WidgetType::Panel) + { + auto& panel = registry->get(childEntity); + if (panel.gpuDataIndex < 0) + return true; + args.instanceCount = 1; + args.firstInstance = static_cast(panel.gpuDataIndex); + } + else // Text + { + auto& text = registry->get(childEntity); + if (text.numCharsNonWhitespace <= 0 || text.gpuDataIndex < 0) + return true; + args.instanceCount = static_cast(text.numCharsNonWhitespace); + args.firstInstance = static_cast(text.gpuDataIndex); + } + + _sortScratch.push_back({ w.sortKey, args }); + return true; + }); + }; + + if (isRT) + { + gather(canvasEntity); + } + else + { + registry->view(entt::exclude).each([&](entt::entity c, Canvas&) + { + gather(c); + }); + } + + const u32 drawCount = static_cast(_sortScratch.size()); + bucket->drawCount = drawCount; + + if (drawCount == 0) + { + // Nothing to draw. Leave retained buffers as-is; the draw pass checks drawCount==0 and skips. + return; + } + + // --- Sort on CPU --------------------------------------------------------------------------- + // std::sort beats our GPU radix sort at UI scale (N up to a few thousand) because the GPU pipe + // is dispatch-overhead-bound regardless of N. See git history for the GPU path (RadixSort.*). 
+ std::sort(_sortScratch.begin(), _sortScratch.end(), [](const SortEntry& a, const SortEntry& b) + { + return a.key < b.key; + }); + + // --- Extract sorted IndirectDraws into contiguous upload vector ---------------------------- + _uploadScratch.clear(); + _uploadScratch.reserve(drawCount); + for (const SortEntry& e : _sortScratch) + _uploadScratch.push_back(e.draw); + + // --- (Re)create retained finalSortedArgs / finalCount if needed ---------------------------- + if (bucket->finalSortedArgsCapacity < drawCount || bucket->finalSortedArgs == Renderer::BufferID::Invalid()) + { + Renderer::BufferDesc argsDesc; + argsDesc.name = isRT ? "UISort.RT.FinalSortedArgs" : "UISort.Main.FinalSortedArgs"; + argsDesc.usage = Renderer::BufferUsage::INDIRECT_ARGUMENT_BUFFER + | Renderer::BufferUsage::TRANSFER_DESTINATION; + argsDesc.size = static_cast(drawCount) * sizeof(Renderer::IndirectDraw); + bucket->finalSortedArgs = _renderer->CreateBuffer(bucket->finalSortedArgs, argsDesc); + bucket->finalSortedArgsCapacity = drawCount; + } + if (bucket->finalCount == Renderer::BufferID::Invalid()) + { + Renderer::BufferDesc countDesc; + countDesc.name = isRT ? "UISort.RT.FinalCount" : "UISort.Main.FinalCount"; + countDesc.usage = Renderer::BufferUsage::INDIRECT_ARGUMENT_BUFFER + | Renderer::BufferUsage::TRANSFER_DESTINATION; + countDesc.size = sizeof(u32); + bucket->finalCount = _renderer->CreateBuffer(countDesc); + } + + // --- Upload -------------------------------------------------------------------------------- + // UploadToBuffer queues a staged copy that completes before the next frame's command list + // runs. Same mechanism we already use for everything else here. 
+ _renderer->UploadToBuffer(bucket->finalSortedArgs, 0, _uploadScratch.data(), 0, static_cast(drawCount) * sizeof(Renderer::IndirectDraw)); + _renderer->UploadToBuffer(bucket->finalCount, 0, &bucket->drawCount, 0, sizeof(u32)); +} diff --git a/Source/Game-Lib/Game-Lib/Rendering/Canvas/CanvasRenderer.h b/Source/Game-Lib/Game-Lib/Rendering/Canvas/CanvasRenderer.h index ea0b5bc4..0dd718aa 100644 --- a/Source/Game-Lib/Game-Lib/Rendering/Canvas/CanvasRenderer.h +++ b/Source/Game-Lib/Game-Lib/Rendering/Canvas/CanvasRenderer.h @@ -8,9 +8,14 @@ #include #include #include +#include + +#include #include +#include + namespace Renderer { class RenderGraph; @@ -65,42 +70,51 @@ class CanvasRenderer void UpdatePanelData(entt::entity entity, ECS::Components::Transform2D& transform, ECS::Components::UI::Panel& panel, ECS::Components::UI::PanelTemplate& panelTemplate); void UpdateTextData(entt::entity entity, ECS::Components::UI::Text& text, ECS::Components::UI::TextTemplate& textTemplate); - bool ChangePipelineIfNecessary(Renderer::CommandList& commandList, Renderer::GraphicsPipelineID& currentPipeline, ECS::Components::UI::WidgetType widgetType); - void RenderPanel(Renderer::CommandList& commandList, ECS::Components::Transform2D& transform, ECS::Components::UI::Widget& widget, ECS::Components::UI::Panel& panel); - void RenderText(Renderer::CommandList& commandList, ECS::Components::Transform2D& transform, ECS::Components::UI::Widget& widget, ECS::Components::UI::Text& text); - vec2 PixelPosToNDC(const vec2& pixelPosition, const vec2& screenSize) const; vec2 PixelSizeToNDC(const vec2& pixelPosition, const vec2& screenSize) const; u32 AddTexture(Renderer::TextureID textureID); u32 LoadTexture(std::string_view path); - -private: - struct PanelDrawData + // --- Sortkey machinery (see CanvasRenderer.cpp for bit layout) ----------------- + // Resolves the effective priority for this widget (0 = normal, >0 = promoted for focus/drag/etc). 
+ u8 ResolvePriority(entt::registry* registry, entt::entity entity) const; + + // Rebuilds the mapping from canvas entity to its 8-bit canvasOrder, based on canvas layer + registry iteration. + void RebuildCanvasOrder(entt::registry* registry); + + // Walks a canvas's subtree depth-first, writing sortKey to each Widget component. Siblings are sorted by + // (Transform2D::layer asc, SceneNode2D::siblingIndex asc) before recursion so the order is deterministic + // and every produced sortKey is unique. + void DfsAssignSortKey(entt::registry* registry, entt::entity entity, u8 canvasOrder, u32& traversalIndex, u8 inheritedPriority); + + // Gather + sort + upload for one render-pass bucket. Walks the canvas subtree(s), filters + // visible Panel/Text entries into _sortScratch, std::sorts by sortKey, copies the sorted + // IndirectDraws into _uploadScratch, and queues a CPU->GPU upload to the bucket's retained + // finalSortedArgs + finalCount. + // + // canvasEntity == entt::null signals "main bucket" (every non-RT canvas merged). 
+ void RefreshBucketCPU(entt::registry* registry, entt::entity canvasEntity, bool isRT); + + +public: + enum class WidgetDrawType : u32 { - public: - uvec3 packed0; // x: textureIndex & additiveTextureIndex, y: clipMaskTextureIndex, z: color - hvec2 textureScaleToWidgetSize = hvec2(0.0f, 0.0f); - vec4 texCoord; // uv - vec4 slicingCoord; // uv - //vec4 color; // xyz: color, w: unused - vec4 cornerRadiusAndBorder; // xy: cornerRadius, zw: border - hvec4 clipRegionRect = hvec4(0.0f, 0.0f, 1.0f, 1.0f); // xy: min, zw: max - hvec4 clipMaskRegionRect = hvec4(0.0f, 0.0f, 1.0f, 1.0f); // xy: min, zw: max - i32 worldPositionIndex = -1; - i32 padding[3]; + Panel = 0, + Text = 1, }; - struct CharDrawData +private: + struct WidgetDrawData { public: - uvec4 packed0; // x: textureIndex & clipMaskTextureIndex, y: charIndex, z: textColor, w: borderColor - vec4 packed1; // x: borderSize, y: padding, zw: unitRangeXY - hvec4 clipRegionRect = hvec4(0.0f, 0.0f, 1.0f, 1.0f); // xy: min, zw: max + uvec4 packed0 = uvec4(0, 0, 0, 0xFFFFFFFFu); // x: type, y: vertexBase, z: clipMaskTextureIndex, w: worldPositionIndex (i32 reinterpret as -1) + uvec4 packed1 = uvec4(0, 0, 0, 0); // Panel: x: textureIndex|additiveTextureIndex, z: color, w: textureScaleToWidgetSize (half2). Text: x: fontTextureIndex, z: textColor, w: borderColor + vec4 texCoord = vec4(0.0f); // Panel only + vec4 slicingCoord = vec4(0.0f); // Panel only + vec4 cornerRadiusAndBorder = vec4(0.0f); // Panel: xy: cornerRadius. 
Text: x: borderSize, zw: unitRange + hvec4 clipRegionRect = hvec4(0.0f, 0.0f, 1.0f, 1.0f); // xy: min, zw: max hvec4 clipMaskRegionRect = hvec4(0.0f, 0.0f, 1.0f, 1.0f); // xy: min, zw: max - i32 worldPositionIndex; - i32 padding[3]; }; private: @@ -109,12 +123,10 @@ class CanvasRenderer DebugRenderer* _debugRenderer; Renderer::GPUVector _vertices; - Renderer::GPUVector _panelDrawDatas; - - Renderer::GPUVector _charDrawDatas; + Renderer::GPUVector _widgetDrawDatas; Renderer::GPUVector _widgetWorldPositions; - + Renderer::Font* _font; Renderer::SamplerID _sampler; Renderer::TextureArrayID _textures; @@ -124,11 +136,42 @@ class CanvasRenderer Renderer::TextureArrayID _fontTextures; robin_hood::unordered_map _textureIDToFontTexturesIndex; - Renderer::GraphicsPipelineID _panelPipeline; - Renderer::GraphicsPipelineID _textPipeline; + Renderer::GraphicsPipelineID _widgetPipeline; + + Renderer::DescriptorSet _widgetDescriptorSet; + + // --- Sortkey state ------------------------------------------------------------ + // Assigned canvasOrder (0..255) per canvas entity, refreshed by RebuildCanvasOrder when + // the canvas SET changes (gated on DirtyCanvasOrderFlag). Read by DfsAssignSortKey to bake + // canvasOrder into each widget's sortKey. + robin_hood::unordered_map _canvasOrderByEntity; + + // Shared scratch for DfsAssignSortKey. Each recursion level appends its children's entities + // to the tail, sorts only its own [start, end) range, recurses by-value, and resizes back + // to its start before returning. Net: zero allocations after the first warmup. + std::vector _siblingScratch; + + // --- Per-bucket retained indirect-draw state ---------------------------------- + // One BucketResources per render-pass bucket: one per RT canvas that has ever existed, + // plus one static _mainBucket for all non-RT canvases merged together. 
finalSortedArgs + // is retained across frames; it's CPU-sorted and uploaded only when the bucket is dirty, + // and consumed as-is by DrawIndirectCount every frame. + struct BucketResources + { + Renderer::BufferID finalSortedArgs = Renderer::BufferID::Invalid(); + u32 finalSortedArgsCapacity = 0; + u32 drawCount = 0; + + // Single-element u32 count buffer for DrawIndirectCount. + Renderer::BufferID finalCount = Renderer::BufferID::Invalid(); + }; - Renderer::DescriptorSet _panelDescriptorSet; - Renderer::DescriptorSet _textDescriptorSet; + robin_hood::unordered_map _rtBuckets; // key: RT canvas entity + BucketResources _mainBucket; - ECS::Components::UI::WidgetType _lastRenderedWidgetType = ECS::Components::UI::WidgetType::None; + // CPU scratch for gather+sort+upload inside RefreshBucketCPU. Reused across refreshes; + // `.clear()` preserves capacity. + struct SortEntry { u32 key; Renderer::IndirectDraw draw; }; + std::vector _sortScratch; + std::vector _uploadScratch; }; \ No newline at end of file diff --git a/Source/Game-Lib/Game-Lib/Rendering/Util/RadixSort.cpp b/Source/Game-Lib/Game-Lib/Rendering/Util/RadixSort.cpp new file mode 100644 index 00000000..9afe8903 --- /dev/null +++ b/Source/Game-Lib/Game-Lib/Rendering/Util/RadixSort.cpp @@ -0,0 +1,294 @@ +#include "RadixSort.h" + +#include "Game-Lib/Rendering/GameRenderer.h" + +#include +#include +#include + +#include + +// Mirror the shader-side constants in Source/Shaders/Shaders/Sorting/Radix/Constants.inc.slang. 
+static constexpr u32 RADIX = 256; +static constexpr u32 WORKGROUP_SIZE = 512; +static constexpr u32 PARTITION_DIVISION = 8; +static constexpr u32 PARTITION_SIZE = PARTITION_DIVISION * WORKGROUP_SIZE; // 4096 +static constexpr u32 NUM_RADIX_PASSES = 4; // 4 passes * 8 bits = 32-bit key + +RadixSort::RadixSort() + : _upsweepFromPingSet(Renderer::DescriptorSetSlot::PER_PASS) + , _upsweepFromPongSet(Renderer::DescriptorSetSlot::PER_PASS) + , _spineSet(Renderer::DescriptorSetSlot::PER_PASS) + , _downsweepPingToPongSet(Renderer::DescriptorSetSlot::PER_PASS) + , _downsweepPongToPingSet(Renderer::DescriptorSetSlot::PER_PASS) +{ +} + +void RadixSort::Init(Renderer::Renderer* renderer, GameRenderer* gameRenderer, u32 maxKeyCount) +{ + _renderer = renderer; + _gameRenderer = gameRenderer; + + CreatePipelines(); + AllocateFixedScratch(); + + _upsweepFromPingSet.RegisterPipeline(_renderer, _upsweepPipeline); + _upsweepFromPingSet.Init(_renderer); + _upsweepFromPongSet.RegisterPipeline(_renderer, _upsweepPipeline); + _upsweepFromPongSet.Init(_renderer); + + _spineSet.RegisterPipeline(_renderer, _spinePipeline); + _spineSet.Init(_renderer); + + _downsweepPingToPongSet.RegisterPipeline(_renderer, _downsweepPipeline); + _downsweepPingToPongSet.Init(_renderer); + _downsweepPongToPingSet.RegisterPipeline(_renderer, _downsweepPipeline); + _downsweepPongToPingSet.Init(_renderer); + + AllocateKeyCountScratch(maxKeyCount); + BindAllDescriptorSets(); + + _initialized = true; +} + +void RadixSort::CreatePipelines() +{ + // Upsweep + Spine: single permutation. 
+ auto loadNoPermutation = [&](const char* shaderPath, Renderer::ComputePipelineID& out, const char* debugName) + { + Renderer::ComputeShaderDesc shaderDesc; + shaderDesc.shaderEntry = _gameRenderer->GetShaderEntry( + Renderer::GetShaderEntryNameHash(shaderPath, {}), + shaderPath); + + Renderer::ComputePipelineDesc pipelineDesc; + pipelineDesc.debugName = debugName; + pipelineDesc.computeShader = _renderer->LoadShader(shaderDesc); + out = _renderer->CreatePipeline(pipelineDesc); + }; + + loadNoPermutation("Sorting/Radix/Upsweep.cs", _upsweepPipeline, "RadixSort.Upsweep"); + loadNoPermutation("Sorting/Radix/Spine.cs", _spinePipeline, "RadixSort.Spine"); + + // Downsweep: compile with KEY_VALUE=1 permutation (we always sort key+value pairs). + { + std::vector permutation = { + { "KEY_VALUE", "1" } + }; + const char* shaderPath = "Sorting/Radix/Downsweep.cs"; + + Renderer::ComputeShaderDesc shaderDesc; + shaderDesc.shaderEntry = _gameRenderer->GetShaderEntry( + Renderer::GetShaderEntryNameHash(shaderPath, permutation), + shaderPath); + + Renderer::ComputePipelineDesc pipelineDesc; + pipelineDesc.debugName = "RadixSort.Downsweep.KV"; + pipelineDesc.computeShader = _renderer->LoadShader(shaderDesc); + _downsweepPipeline = _renderer->CreatePipeline(pipelineDesc); + } +} + +void RadixSort::AllocateFixedScratch() +{ + // globalHistogram: one u32[256] per radix pass = 4 * 256 * 4 bytes = 4 KiB. Zeroed by + // FillBuffer at the start of each sort (globalHistogram is accumulated then scanned per pass). 
+ Renderer::BufferDesc desc; + desc.name = "RadixSort.GlobalHistogram"; + desc.usage = Renderer::BufferUsage::STORAGE_BUFFER | Renderer::BufferUsage::TRANSFER_DESTINATION; + desc.size = NUM_RADIX_PASSES * RADIX * sizeof(u32); + _globalHistogram = _renderer->CreateBuffer(desc); +} + +void RadixSort::AllocateKeyCountScratch(u32 newMaxKeyCount) +{ + const u8 sortScratchUsage = Renderer::BufferUsage::STORAGE_BUFFER + | Renderer::BufferUsage::TRANSFER_DESTINATION; + const u8 identityUsage = Renderer::BufferUsage::STORAGE_BUFFER + | Renderer::BufferUsage::TRANSFER_SOURCE; + + Renderer::BufferDesc desc; + desc.usage = sortScratchUsage; + desc.size = static_cast(newMaxKeyCount) * sizeof(u32); + + desc.name = "RadixSort.SortKeys"; + _sortKeys = _renderer->CreateBuffer(_sortKeys, desc); + + desc.name = "RadixSort.WriteKeys"; + _writeKeys = _renderer->CreateBuffer(_writeKeys, desc); + + desc.name = "RadixSort.SortValues"; + _sortValues = _renderer->CreateBuffer(_sortValues, desc); + + desc.name = "RadixSort.WriteValues"; + _writeValues = _renderer->CreateBuffer(_writeValues, desc); + + // partitionHistogram: u32[maxPartitions * 256]. Written by upsweep, scanned by spine, read by + // downsweep. No TRANSFER usage needed (never copied to/from). + const u32 maxPartitions = (newMaxKeyCount + PARTITION_SIZE - 1) / PARTITION_SIZE; + desc.name = "RadixSort.PartitionHistogram"; + desc.usage = Renderer::BufferUsage::STORAGE_BUFFER; + desc.size = static_cast(std::max(maxPartitions, 1u)) * RADIX * sizeof(u32); + _partitionHistogram = _renderer->CreateBuffer(_partitionHistogram, desc); + + // Identity values buffer: [0, 1, 2, ..., newMaxKeyCount-1]. CopyBuffer source only. 
+ desc.name = "RadixSort.IdentityValues"; + desc.usage = identityUsage; + desc.size = static_cast(newMaxKeyCount) * sizeof(u32); + _identityValues = _renderer->CreateAndFillBuffer(_identityValues, desc, + [newMaxKeyCount](void* mapped, size_t) { + u32* p = static_cast(mapped); + for (u32 i = 0; i < newMaxKeyCount; ++i) + p[i] = i; + }); + + _maxKeyCount = newMaxKeyCount; +} + +void RadixSort::BindAllDescriptorSets() +{ + // Binding numbers match the shader's [[vk::binding(N, PER_PASS)]]. Binding 0 is not used -- + // we dropped the elementCounts buffer in favour of a push constant. + _upsweepFromPingSet.Bind("globalHistogram"_h, _globalHistogram); + _upsweepFromPingSet.Bind("partitionHistogram"_h, _partitionHistogram); + _upsweepFromPingSet.Bind("keys"_h, _sortKeys); + + _upsweepFromPongSet.Bind("globalHistogram"_h, _globalHistogram); + _upsweepFromPongSet.Bind("partitionHistogram"_h, _partitionHistogram); + _upsweepFromPongSet.Bind("keys"_h, _writeKeys); + + _spineSet.Bind("globalHistogram"_h, _globalHistogram); + _spineSet.Bind("partitionHistogram"_h, _partitionHistogram); + + // Downsweep ping->pong: read sortKeys/sortValues, write writeKeys/writeValues. + _downsweepPingToPongSet.Bind("globalHistogram"_h, _globalHistogram); + _downsweepPingToPongSet.Bind("partitionHistogram"_h, _partitionHistogram); + _downsweepPingToPongSet.Bind("keysIn"_h, _sortKeys); + _downsweepPingToPongSet.Bind("keysOut"_h, _writeKeys); + _downsweepPingToPongSet.Bind("valuesIn"_h, _sortValues); + _downsweepPingToPongSet.Bind("valuesOut"_h, _writeValues); + + // Downsweep pong->ping: the reverse. 
+ _downsweepPongToPingSet.Bind("globalHistogram"_h, _globalHistogram); + _downsweepPongToPingSet.Bind("partitionHistogram"_h, _partitionHistogram); + _downsweepPongToPingSet.Bind("keysIn"_h, _writeKeys); + _downsweepPongToPingSet.Bind("keysOut"_h, _sortKeys); + _downsweepPongToPingSet.Bind("valuesIn"_h, _writeValues); + _downsweepPongToPingSet.Bind("valuesOut"_h, _sortValues); +} + +void RadixSort::EnsureCapacity(u32 requiredMaxKeyCount) +{ + if (!_initialized || requiredMaxKeyCount <= _maxKeyCount) + return; + + const u32 newCap = std::max(_maxKeyCount * 2, requiredMaxKeyCount); + + AllocateKeyCountScratch(newCap); + + // Rebind every descriptor set that references the resized buffers. Safe here because the + // caller must invoke EnsureCapacity CPU-side (outside any render-graph execute) -- the + // previous frame's command list either isn't submitted yet or has already released the old + // IDs (deferred-destroyed via QueueDestroyBuffer inside CreateBuffer(existing, desc)). + _upsweepFromPingSet.Bind("partitionHistogram"_h, _partitionHistogram); + _upsweepFromPingSet.Bind("keys"_h, _sortKeys); + + _upsweepFromPongSet.Bind("partitionHistogram"_h, _partitionHistogram); + _upsweepFromPongSet.Bind("keys"_h, _writeKeys); + + _spineSet.Bind("partitionHistogram"_h, _partitionHistogram); + + _downsweepPingToPongSet.Bind("partitionHistogram"_h, _partitionHistogram); + _downsweepPingToPongSet.Bind("keysIn"_h, _sortKeys); + _downsweepPingToPongSet.Bind("keysOut"_h, _writeKeys); + _downsweepPingToPongSet.Bind("valuesIn"_h, _sortValues); + _downsweepPingToPongSet.Bind("valuesOut"_h, _writeValues); + + _downsweepPongToPingSet.Bind("partitionHistogram"_h, _partitionHistogram); + _downsweepPongToPingSet.Bind("keysIn"_h, _writeKeys); + _downsweepPongToPingSet.Bind("keysOut"_h, _sortKeys); + _downsweepPongToPingSet.Bind("valuesIn"_h, _writeValues); + _downsweepPongToPingSet.Bind("valuesOut"_h, _sortValues); +} + +RadixSort::PassResources 
RadixSort::RegisterPass(Renderer::RenderGraphBuilder& builder) +{ + using BufferUsage = Renderer::BufferPassUsage; + + PassResources res; + + // sortKeys/sortValues are CopyBuffer destinations THEN compute read/write within the sort, + // so they need TRANSFER | COMPUTE. + res.sortKeys = builder.Write(_sortKeys, BufferUsage::TRANSFER | BufferUsage::COMPUTE); + res.sortValues = builder.Write(_sortValues, BufferUsage::TRANSFER | BufferUsage::COMPUTE); + builder.Write(_writeKeys, BufferUsage::COMPUTE); + builder.Write(_writeValues, BufferUsage::COMPUTE); + res.identityValues = builder.Read(_identityValues, BufferUsage::TRANSFER); + + // globalHistogram is zeroed via FillBuffer (TRANSFER) then read/written by compute. + res.globalHistogram = builder.Write(_globalHistogram, BufferUsage::TRANSFER | BufferUsage::COMPUTE); + builder.Write(_partitionHistogram, BufferUsage::COMPUTE); + + res.upsweepFromPing = builder.Use(_upsweepFromPingSet); + res.upsweepFromPong = builder.Use(_upsweepFromPongSet); + res.spine = builder.Use(_spineSet); + res.downsweepPingToPong = builder.Use(_downsweepPingToPongSet); + res.downsweepPongToPing = builder.Use(_downsweepPongToPingSet); + + return res; +} + +void RadixSort::RecordSort(Renderer::CommandList& commandList, u8 frameIndex, + const PassResources& passRes, u32 numKeys) +{ + if (numKeys == 0) + return; + + const u32 partitionCount = (numKeys + PARTITION_SIZE - 1) / PARTITION_SIZE; + + struct RadixPC { u32 pass; u32 elementCount; }; + + commandList.PushMarker("RadixSort", Color::Green); + + // Zero the 4 KiB global histogram before each sort. Spine writes a prefix-sum over the per- + // pass counts accumulated by upsweep into this buffer; we need a clean slate. 
+ commandList.FillBuffer(passRes.globalHistogram, 0, NUM_RADIX_PASSES * RADIX * sizeof(u32), 0); + commandList.BufferBarrier(passRes.globalHistogram, Renderer::BufferPassUsage::TRANSFER); + + for (u32 pass = 0; pass < NUM_RADIX_PASSES; ++pass) + { + const bool fromPing = (pass & 1) == 0; + RadixPC pc{ pass, numKeys }; + + // Upsweep: builds per-partition histograms + global histogram for this pass. + commandList.BeginPipeline(_upsweepPipeline); + commandList.PushConstant(&pc, 0, sizeof(pc)); + commandList.BindDescriptorSet(fromPing ? passRes.upsweepFromPing : passRes.upsweepFromPong, frameIndex); + commandList.Dispatch(partitionCount, 1, 1); + commandList.EndPipeline(_upsweepPipeline); + + commandList.BufferBarrier(passRes.sortKeys, Renderer::BufferPassUsage::COMPUTE); + + // Spine: prefix-scan the per-partition histograms (one group per radix bin) + prefix-scan + // the global histogram for this pass (bin 0's group handles that). NOTE(review): the barrier above names sortKeys, yet spine reads the histograms upsweep wrote -- confirm BufferBarrier is not scoped to the named buffer only. + commandList.BeginPipeline(_spinePipeline); + commandList.PushConstant(&pc, 0, sizeof(pc)); + commandList.BindDescriptorSet(passRes.spine, frameIndex); + commandList.Dispatch(RADIX, 1, 1); + commandList.EndPipeline(_spinePipeline); + + commandList.BufferBarrier(passRes.sortKeys, Renderer::BufferPassUsage::COMPUTE); + + // Downsweep: scatter keys and values to their globally sorted positions for this pass. + commandList.BeginPipeline(_downsweepPipeline); + commandList.PushConstant(&pc, 0, sizeof(pc)); + commandList.BindDescriptorSet(fromPing ? 
passRes.downsweepPingToPong : passRes.downsweepPongToPing, frameIndex); + commandList.Dispatch(partitionCount, 1, 1); + commandList.EndPipeline(_downsweepPipeline); + + if (pass + 1 < NUM_RADIX_PASSES) + commandList.BufferBarrier(passRes.sortKeys, Renderer::BufferPassUsage::COMPUTE); + } + + commandList.PopMarker(); +} diff --git a/Source/Game-Lib/Game-Lib/Rendering/Util/RadixSort.h b/Source/Game-Lib/Game-Lib/Rendering/Util/RadixSort.h new file mode 100644 index 00000000..7661771e --- /dev/null +++ b/Source/Game-Lib/Game-Lib/Rendering/Util/RadixSort.h @@ -0,0 +1,113 @@ +#pragma once +#include + +#include +#include +#include +#include +#include + +namespace Renderer +{ + class Renderer; + class RenderGraphBuilder; + class CommandList; +} + +class GameRenderer; + +// GPU u32 radix sort (reduce-then-scan), 8 bits per pass => 4 passes per u32 key, 3 dispatches per +// pass => 12 dispatches per sort. Port of https://github.com/jaesung-cs/vulkan_radix_sort. +// +// Architecture: a single descriptor-set family shared across every sort in the frame. Inputs are +// staged into `sortKeys`/`sortValues` via vkCmdCopyBuffer before each sort. All descriptor sets +// bind shared scratch buffers, are bound once at Init, and are rebound only by EnsureCapacity +// (never per sort) -- keeps SSBO descriptor-pool cost flat regardless of how many sorts run per frame. +// +// Usage: +// 1. Call `RegisterPass(builder)` once inside a render-graph pass's onSetup lambda. +// 2. For each sort inside onExecute: +// a. CopyBuffer caller's keys -> passRes.sortKeys. +// b. CopyBuffer identityValues -> passRes.sortValues (seed payload = [0..N-1]). +// c. Barriers from TRANSFER to COMPUTE on sortKeys / sortValues. +// d. Call `RecordSort(cl, frameIndex, passRes, numKeys)`. +// After RecordSort returns, passRes.sortKeys/sortValues contain the sorted pairs in place +// (4 passes = even count, so the ping-pong lands back on the input buffers). 
+// +// Growth: `EnsureCapacity(N)` reallocates size-dependent scratch to at least N keys and rebinds +// the 5 sort descriptor sets. Must be called CPU-side (outside render-graph execution). +class RadixSort +{ +public: + struct PassResources + { + Renderer::DescriptorSetResource upsweepFromPing; + Renderer::DescriptorSetResource upsweepFromPong; + Renderer::DescriptorSetResource spine; + Renderer::DescriptorSetResource downsweepPingToPong; + Renderer::DescriptorSetResource downsweepPongToPing; + + // Shared-scratch handles the caller copies INTO before calling RecordSort. + Renderer::BufferMutableResource sortKeys; // u32[maxN] + Renderer::BufferMutableResource sortValues; // u32[maxN] + + // Source the caller copies FROM when seeding sortValues = [0..N-1]. + Renderer::BufferResource identityValues; // u32[maxN] + + // Internal scratch exposed only so RecordSort can FillBuffer / BufferBarrier it. + // Callers shouldn't touch this. + Renderer::BufferMutableResource globalHistogram; + }; + + RadixSort(); + + void Init(Renderer::Renderer* renderer, GameRenderer* gameRenderer, u32 maxKeyCount); + + PassResources RegisterPass(Renderer::RenderGraphBuilder& builder); + + void RecordSort(Renderer::CommandList& commandList, u8 frameIndex, + const PassResources& passRes, u32 numKeys); + + void EnsureCapacity(u32 requiredMaxKeyCount); + + Renderer::BufferID GetSortValuesBuffer() const { return _sortValues; } + u32 GetMaxKeyCount() const { return _maxKeyCount; } + +private: + void CreatePipelines(); + void AllocateFixedScratch(); // globalHistogram (fixed, 4 KiB) + void AllocateKeyCountScratch(u32 newMaxKeyCount); // sortKeys/sortValues/writeKeys/writeValues/identityValues/partitionHistogram + void BindAllDescriptorSets(); + +private: + Renderer::Renderer* _renderer = nullptr; + GameRenderer* _gameRenderer = nullptr; + + u32 _maxKeyCount = 0; + + // --- Pipelines (3) -------------------------------------------------------------------- + Renderer::ComputePipelineID 
_upsweepPipeline; + Renderer::ComputePipelineID _spinePipeline; + Renderer::ComputePipelineID _downsweepPipeline; // compiled with KEY_VALUE=1 permutation + + // --- 5 shared descriptor sets (bound once, rebound only by EnsureCapacity) ------------ + Renderer::DescriptorSet _upsweepFromPingSet; + Renderer::DescriptorSet _upsweepFromPongSet; + Renderer::DescriptorSet _spineSet; + Renderer::DescriptorSet _downsweepPingToPongSet; + Renderer::DescriptorSet _downsweepPongToPingSet; + + // --- Scratch buffers ------------------------------------------------------------------ + // Size-dependent on _maxKeyCount (resized by EnsureCapacity): + Renderer::BufferID _sortKeys = Renderer::BufferID::Invalid(); // u32[maxN] + Renderer::BufferID _sortValues = Renderer::BufferID::Invalid(); // u32[maxN] + Renderer::BufferID _writeKeys = Renderer::BufferID::Invalid(); // u32[maxN] ping-pong + Renderer::BufferID _writeValues = Renderer::BufferID::Invalid(); // u32[maxN] ping-pong + Renderer::BufferID _identityValues = Renderer::BufferID::Invalid(); // u32[maxN] = [0..maxN-1] + Renderer::BufferID _partitionHistogram = Renderer::BufferID::Invalid(); // u32[maxPartitions * 256] + + // Fixed-size scratch (never resized): + Renderer::BufferID _globalHistogram; // u32[4 * 256] = 4 KiB + + bool _initialized = false; +}; diff --git a/Source/Resources/Scripts/API/Input/Input.luau b/Source/Resources/Scripts/API/Input/Input.luau index c2cf01a8..f47052a6 100644 --- a/Source/Resources/Scripts/API/Input/Input.luau +++ b/Source/Resources/Scripts/API/Input/Input.luau @@ -80,7 +80,7 @@ inputTableAPI.keyNames = [284] = "PAUSE", [290] = "F1", [291] = "F2", - [292] = "F2", + [292] = "F3", [293] = "F4", [294] = "F5", [295] = "F6", diff --git a/Source/Resources/Scripts/UI/Demo.luau b/Source/Resources/Scripts/UI/Demo.luau index d2d18fea..8b625061 100644 --- a/Source/Resources/Scripts/UI/Demo.luau +++ b/Source/Resources/Scripts/UI/Demo.luau @@ -63,16 +63,16 @@ local function CreateMultiLineDemo(rtCanvas : 
Canvas) -- Create a panel local panel = canvas:NewPanel(0, 0, 400, 400, 0, "DialogBox"); - --panel:SetAnchor(0.5, 0.5); - --panel:SetRelativePoint(0.5, 0.5); + panel:SetAnchor(0.5, 0.5); + panel:SetRelativePoint(0.5, 0.5); - panel:SetAnchor(0.0, 0.0); - panel:SetRelativePoint(1.0, 0.0); + --panel:SetAnchor(0.0, 0.0); + --panel:SetRelativePoint(1.0, 0.0); --panel:DebugSetWorldTransformIndex(0); local middlePanel = panel:NewPanel(0, 0, 50, 50, 0, "DialogBox"); - middlePanel:SetAnchor(0.0, 0.0); - middlePanel:SetRelativePoint(0.0, 0.0); + middlePanel:SetAnchor(0.5, 0.5); + middlePanel:SetRelativePoint(0.5, 0.5); local bottomLeft = panel:NewText("Bottom Left\nLeft Bottom", 0, 0, 0, "DefaultButtonText"); bottomLeft:SetAnchor(0.0, 0.0); @@ -110,6 +110,408 @@ local function Demo() CreateMultiLineDemo(rtCanvas) end +-- ===================================================================== +-- SortingDemo +-- +-- Visual regression case for the upcoming GPU-driven UI renderer (merged +-- Widget pipeline + GPU sort + indirect draw). Run this BEFORE the +-- refactor and screenshot it as a reference, then run it AFTER and +-- compare. Every test must look pixel-identical, modulo deliberate new +-- behaviour: tests 7 and 8 start exercising the layer parameter once +-- GPU sort honors it. +-- +-- Sibling order: siblings are drawn in INSERTION ORDER (first inserted +-- on the bottom, last inserted on top). Several tests use 4+ siblings +-- to verify this directly. +-- ===================================================================== +local function CreateSortingDemo() + -- Local debug colors so we don't pollute Templates.luau. Re-registering an + -- existing template name is a no-op (or overwrite, either way harmless). 
+ UI.RegisterPanelTemplate("SortDemoOrange", { cornerRadius = 0.0, color = vector.create(1.00, 0.55, 0.00) }); + UI.RegisterPanelTemplate("SortDemoYellow", { cornerRadius = 0.0, color = vector.create(1.00, 0.95, 0.00) }); + UI.RegisterPanelTemplate("SortDemoCyan", { cornerRadius = 0.0, color = vector.create(0.00, 0.85, 0.95) }); + UI.RegisterPanelTemplate("SortDemoMagenta", { cornerRadius = 0.0, color = vector.create(1.00, 0.00, 0.80) }); + UI.RegisterPanelTemplate("SortDemoCellBG", { cornerRadius = 0.0, color = vector.create(0.10, 0.10, 0.13) }); + + -- ======================================================================= + -- Test 9 extras: two top-level canvases that overlap inside cell 9's + -- screen-space area. entt's view iterates LAST-CREATED first, so + -- the first-created canvas ends up drawn LAST (on top). To get the + -- desired stack from bottom to top: + -- SortingDemo (bottom, cell 9 background) + -- CanvasA (middle) + -- CanvasB (top) + -- ...we must create them in REVERSE of that order here: + -- CanvasB first, CanvasA next, SortingDemo last. + -- + -- Cell 9 screen bounds (col=2, row=2): x 1284..1902, y 14..344. + -- Panels are placed in y 130..280 so they don't overlap the cell title + -- at ~y 336 or the multi-line hint at ~y 22..110. 
+ -- ======================================================================= + do + -- CanvasB (created FIRST => drawn LAST => on top) + local canvasB : Canvas = UI.GetCanvas("SortingDemo_CanvasB", 0, 0, 1920, 1080) + local bPanel = canvasB:NewPanel(1490, 130, 200, 130, 0, "DebugGreen") + bPanel:SetAnchor(0.0, 0.0); bPanel:SetRelativePoint(0.0, 0.0) + local bChild = bPanel:NewPanel(0, 0, 140, 80, 0, "SortDemoYellow") + bChild:SetAnchor(0.5, 0.5); bChild:SetRelativePoint(0.5, 0.5) + local bText = bPanel:NewText("CanvasB", 10, 10, 0, "DefaultButtonText") + bText:SetAnchor(0.0, 0.0); bText:SetRelativePoint(0.0, 0.0) + + -- CanvasA (created SECOND => drawn in the middle) + local canvasA : Canvas = UI.GetCanvas("SortingDemo_CanvasA", 0, 0, 1920, 1080) + local aPanel = canvasA:NewPanel(1340, 150, 200, 130, 0, "DebugRed") + aPanel:SetAnchor(0.0, 0.0); aPanel:SetRelativePoint(0.0, 0.0) + local aChild = aPanel:NewPanel(0, 0, 140, 80, 0, "DebugBlue") + aChild:SetAnchor(0.5, 0.5); aChild:SetRelativePoint(0.5, 0.5) + local aText = aPanel:NewText("CanvasA", 10, 10, 0, "DefaultButtonText") + aText:SetAnchor(0.0, 0.0); aText:SetRelativePoint(0.0, 0.0) + end + + -- Main SortingDemo canvas. Created LAST of the three so it's iterated + -- first and drawn on the bottom, letting CanvasA and CanvasB show through + -- over cell 9's background. + local canvas : Canvas = UI.GetCanvas("SortingDemo", 0, 0, 1920, 1080); + + -- Top title bar + local title = canvas:NewText("SortingDemo - validate before/after GPU-sort refactor", 0, -6, 0, "DefaultButtonText"); + title:SetAnchor(0.5, 1.0); + title:SetRelativePoint(0.5, 1.0); + + -- 3 columns x 3 rows. row 0 = TOP, row 2 = bottom. col 0 = left, col 2 = right. + -- Numbering reads left-to-right, top-to-bottom (row*3 + col + 1). 
+ local canvasH = 1080 + local topMargin = 28 -- room for title bar above the cells + local cellW, cellH = 618, 330 + local padX, padY = 16, 16 + + local function makeCell(col, row, label) + local x = padX + col * (cellW + padX) + local y = (canvasH - topMargin) - (cellH + padY) * (row + 1) + + -- Cell background frame + local cell = canvas:NewPanel(x, y, cellW, cellH, 0, "SortDemoCellBG") + cell:SetAnchor(0.0, 0.0) + cell:SetRelativePoint(0.0, 0.0) + + -- Cell title (top-left corner of cell) + local titleWidget = cell:NewText(label, 10, -8, 0, "DefaultButtonText") + titleWidget:SetAnchor(0.0, 1.0) + titleWidget:SetRelativePoint(0.0, 1.0) + + return cell + end + + -- ========================================================= + -- Test 1 [top-left]: Sibling panels - 4-step staircase + -- 4 overlapping panels inserted in order R, G, B, Yellow. + -- Verifies the fixed insertion-order iteration with >2 siblings. + -- ========================================================= + do + local cell = makeCell(0, 0, "1: Sibling Panels (x4)") + + local templates = { "DebugRed", "DebugGreen", "DebugBlue", "SortDemoYellow" } + local size = 130 + local step = 18 + local startOffset = -((#templates - 1) * step) / 2 + for i = 1, #templates do + local off = startOffset + (i - 1) * step + local p = cell:NewPanel(off, -off, size, size, 0, templates[i]) + p:SetAnchor(0.5, 0.5); p:SetRelativePoint(0.5, 0.5) + end + + local hintText = [[ +cell: + Red + Green + Blue + Yellow <- on top]] + local hint = cell:NewText(hintText, 10, 8, 0, "DefaultDebugText") + hint:SetAnchor(0.0, 0.0); hint:SetRelativePoint(0.0, 0.0) + end + + -- ========================================================= + -- Test 2 [top-mid]: Sibling text - 4-step insertion order + -- Y step is half the X step so consecutive labels actually + -- overlap vertically (text height ~22px, Y step = 9px). 
+ -- ========================================================= + do + local cell = makeCell(1, 0, "2: Sibling Text (x4)") + + local labels = { "FIRST", "SECOND", "THIRD", "FOURTH" } + local stepX = 18 + local stepY = 9 + local startX = -((#labels - 1) * stepX) / 2 + local startY = -((#labels - 1) * stepY) / 2 + for i = 1, #labels do + local offX = startX + (i - 1) * stepX + local offY = startY + (i - 1) * stepY + local t = cell:NewText(labels[i], offX, -offY, 0, "DefaultButtonText") + t:SetAnchor(0.5, 0.5); t:SetRelativePoint(0.5, 0.5) + end + + local hintText = [[ +cell: + FIRST + SECOND + THIRD + FOURTH <- on top]] + local hint = cell:NewText(hintText, 10, 8, 0, "DefaultDebugText") + hint:SetAnchor(0.0, 0.0); hint:SetRelativePoint(0.0, 0.0) + end + + -- ========================================================= + -- Test 3 [top-right]: Mixed Panel + Text (sibling, both directions) + -- Two pairs in one cell, each isolated from the other: + -- Pair A (left side): Panel inserted FIRST, Text inserted SECOND. + -- Expected: text draws ON TOP of the panel. + -- Pair B (right side): Text inserted FIRST, Panel inserted SECOND. + -- Expected: opaque panel HIDES the text. + -- This is the critical test for the merged Widget pipeline - the + -- old two-pipeline path could not interleave panel and text draws + -- in arbitrary insertion order. 
+ -- ========================================================= + do + local cell = makeCell(2, 0, "3: Mixed P+T sibling (both ways)") + + -- Pair A: panel first, text second (text expected on top) + local pA = cell:NewPanel(-130, 25, 200, 70, 0, "DebugBlue") + pA:SetAnchor(0.5, 0.5); pA:SetRelativePoint(0.5, 0.5) + local tA = cell:NewText("Text over Blue", -130, 25, 0, "DefaultButtonText") + tA:SetAnchor(0.5, 0.5); tA:SetRelativePoint(0.5, 0.5) + + -- Pair B: text first, panel second (panel expected to hide text) + local tB = cell:NewText("Hidden by Red", 130, -25, 0, "DefaultButtonText") + tB:SetAnchor(0.5, 0.5); tB:SetRelativePoint(0.5, 0.5) + local pB = cell:NewPanel(130, -25, 200, 70, 0, "DebugRed") + pB:SetAnchor(0.5, 0.5); pB:SetRelativePoint(0.5, 0.5) + + local hintText = [[ +cell: + Blue panel + "Text over Blue" <- VISIBLE + + "Hidden by Red" <- INVISIBLE + Red panel]] + local hint = cell:NewText(hintText, 10, 8, 0, "DefaultDebugText") + hint:SetAnchor(0.0, 0.0); hint:SetRelativePoint(0.0, 0.0) + end + + -- ========================================================= + -- Test 4 [mid-left]: Hierarchy depth (single-child chain) + -- Single-child means there's no sibling order at all; each + -- inner level cleanly draws on top of its parent. Tests + -- depth-first traversal of a deep chain. 
+ -- ========================================================= + do + local cell = makeCell(0, 1, "4: Hierarchy Depth") + + local p1 = cell:NewPanel(0, 0, 180, 180, 0, "DebugRed") + p1:SetAnchor(0.5, 0.5); p1:SetRelativePoint(0.5, 0.5) + + local p2 = p1:NewPanel(0, 0, 130, 130, 0, "DebugGreen") + p2:SetAnchor(0.5, 0.5); p2:SetRelativePoint(0.5, 0.5) + + local p3 = p2:NewPanel(0, 0, 90, 90, 0, "DebugBlue") + p3:SetAnchor(0.5, 0.5); p3:SetRelativePoint(0.5, 0.5) + + local p4 = p3:NewPanel(0, 0, 50, 50, 0, "SortDemoYellow") + p4:SetAnchor(0.5, 0.5); p4:SetRelativePoint(0.5, 0.5) + + local label = p4:NewText("DEEP", 0, 0, 0, "DefaultDebugText") + label:SetAnchor(0.5, 0.5); label:SetRelativePoint(0.5, 0.5) + + local hintText = [[ +cell: + Red + Green + Blue + Yellow + "DEEP"]] + local hint = cell:NewText(hintText, 10, 8, 0, "DefaultDebugText") + hint:SetAnchor(0.0, 0.0); hint:SetRelativePoint(0.0, 0.0) + end + + -- ========================================================= + -- Test 5 [mid-mid]: Branching hierarchy + -- A single parent panel with two non-overlapping child subtrees. + -- Each child has its own grandchild text. 
Verifies depth-first + -- traversal across MULTIPLE BRANCHES at depth >= 2: + -- parent -> leftChild -> leftText -> rightChild -> rightText + -- ========================================================= + do + local cell = makeCell(1, 1, "5: Branching Hierarchy") + + local parent = cell:NewPanel(0, 0, 414, 153, 0, "DebugDarkGrey") + parent:SetAnchor(0.5, 0.5); parent:SetRelativePoint(0.5, 0.5) + + local leftChild = parent:NewPanel(-105, 0, 180, 120, 0, "DebugRed") + leftChild:SetAnchor(0.5, 0.5); leftChild:SetRelativePoint(0.5, 0.5) + local leftText = leftChild:NewText("LEFT", 0, 0, 0, "DefaultButtonText") + leftText:SetAnchor(0.5, 0.5); leftText:SetRelativePoint(0.5, 0.5) + + local rightChild = parent:NewPanel(105, 0, 180, 120, 0, "DebugBlue") + rightChild:SetAnchor(0.5, 0.5); rightChild:SetRelativePoint(0.5, 0.5) + local rightText = rightChild:NewText("RIGHT", 0, 0, 0, "DefaultButtonText") + rightText:SetAnchor(0.5, 0.5); rightText:SetRelativePoint(0.5, 0.5) + + local hintText = [[ +cell: + DarkGrey parent + Red leftChild + "LEFT" + Blue rightChild + "RIGHT"]] + local hint = cell:NewText(hintText, 10, 8, 0, "DefaultDebugText") + hint:SetAnchor(0.0, 0.0); hint:SetRelativePoint(0.0, 0.0) + end + + -- ========================================================= + -- Test 6 [mid-right]: Two top-level subtrees that overlap + -- Two parent panels (A then B) inserted as siblings of the + -- cell, each containing a child panel and a child text. Verifies + -- that the entire B subtree draws over the entire A subtree, not + -- interleaved at the leaves. 
+ -- ========================================================= + do + local cell = makeCell(2, 1, "6: Two-Parent Subtrees") + + -- Parent A (DarkGrey base) inserted first + local parentA = cell:NewPanel(-50, -15, 240, 160, 0, "DebugDarkGrey") + parentA:SetAnchor(0.5, 0.5); parentA:SetRelativePoint(0.5, 0.5) + local aChildPanel = parentA:NewPanel(0, 0, 170, 110, 0, "DebugRed") + aChildPanel:SetAnchor(0.5, 0.5); aChildPanel:SetRelativePoint(0.5, 0.5) + local aChildText = parentA:NewText("A.text", 0, 0, 0, "DefaultButtonText") + aChildText:SetAnchor(0.5, 0.5); aChildText:SetRelativePoint(0.5, 0.5) + + -- Parent B (Cyan tinted) inserted second, overlapping parent A + local parentB = cell:NewPanel(50, 15, 240, 160, 0, "SortDemoCyan") + parentB:SetAnchor(0.5, 0.5); parentB:SetRelativePoint(0.5, 0.5) + local bChildPanel = parentB:NewPanel(0, 0, 170, 110, 0, "DebugBlue") + bChildPanel:SetAnchor(0.5, 0.5); bChildPanel:SetRelativePoint(0.5, 0.5) + local bChildText = parentB:NewText("B.text", 0, 0, 0, "DefaultButtonText") + bChildText:SetAnchor(0.5, 0.5); bChildText:SetRelativePoint(0.5, 0.5) + + local hintText = [[ +cell: + DarkGrey parentA + Red child + "A.text" + Cyan parentB + Blue child + "B.text"]] + local hint = cell:NewText(hintText, 10, 8, 0, "DefaultDebugText") + hint:SetAnchor(0.0, 0.0); hint:SetRelativePoint(0.0, 0.0) + end + + -- ========================================================= + -- Test 7 [bot-left]: Z-index (layer) - 3 panels, gradient sort + -- Three sibling panels with three DIFFERENT layer values to + -- verify the post-refactor layer sort handles a gradient (not + -- just a binary high/low). The MIDDLE panel has the highest + -- layer, so post-refactor it should be on top despite being + -- inserted second. + -- This is one of the TWO tests (together with Test 8) whose visual + -- is expected to change after the refactor. 
+ -- ========================================================= + do + local cell = makeCell(0, 2, "7: Z-index (3 layers)") + + local pRed = cell:NewPanel(-50, 20, 160, 160, 50, "DebugRed") + pRed:SetAnchor(0.5, 0.5); pRed:SetRelativePoint(0.5, 0.5) + + local pGreen = cell:NewPanel( 0, 0, 160, 160, 100, "DebugGreen") + pGreen:SetAnchor(0.5, 0.5); pGreen:SetRelativePoint(0.5, 0.5) + pGreen:SetAlpha(0.5) -- semi-transparent so we can see whether Red or Blue is stacked under it + + local pBlue = cell:NewPanel( 50, -20, 160, 160, 25, "DebugBlue") + pBlue:SetAnchor(0.5, 0.5); pBlue:SetRelativePoint(0.5, 0.5) + + local hintText = [[ +cell: + Red (layer 50, 1st) + Green (layer 100, 2nd) + Blue (layer 25, 3rd) + +BEFORE: Blue on top (insertion order) +AFTER : Green on top, Red middle, Blue bottom + (sorted by layer descending)]] + local hint = cell:NewText(hintText, 10, 8, 0, "DefaultDebugText") + hint:SetAnchor(0.0, 0.0); hint:SetRelativePoint(0.0, 0.0) + end + + -- ========================================================= + -- Test 8 [bot-mid]: Combined nesting + siblings + layers + -- A parent panel with two child subtrees, where the FIRST + -- subtree is given a high layer. Pre-refactor: insertion order + -- means the second subtree covers the first. Post-refactor: + -- the layer-elevated subtree should rise above the second. + -- Forces the sort design to commit to "does layer cross + -- subtree boundaries?" - if yes, the whole Red subtree (panel + -- + text) should appear on top, including its child text. 
+ -- ========================================================= + do + local cell = makeCell(1, 2, "8: Nest + Siblings + Layers") + + local parent = cell:NewPanel(0, 10, 460, 170, 0, "DebugDarkGrey") + parent:SetAnchor(0.5, 0.5); parent:SetRelativePoint(0.5, 0.5) + + -- Red subtree, inserted FIRST, layer 100 + local redChild = parent:NewPanel(-70, 0, 200, 110, 100, "DebugRed") + redChild:SetAnchor(0.5, 0.5); redChild:SetRelativePoint(0.5, 0.5) + local redText = redChild:NewText("L", 0, 0, 0, "DefaultButtonText") + redText:SetAnchor(0.5, 0.5); redText:SetRelativePoint(0.5, 0.5) + + -- Green subtree, inserted SECOND, layer 0, OVERLAPS Red + local greenChild = parent:NewPanel(70, 0, 200, 110, 0, "DebugGreen") + greenChild:SetAnchor(0.5, 0.5); greenChild:SetRelativePoint(0.5, 0.5) + local greenText = greenChild:NewText("R", 0, 0, 0, "DefaultButtonText") + greenText:SetAnchor(0.5, 0.5); greenText:SetRelativePoint(0.5, 0.5) + + local hintText = [[ +cell: + DarkGrey parent + Red (layer 100, 1st) + "L" + Green (layer 0, 2nd) + "R" + +BEFORE: Green's subtree covers Red in overlap +AFTER : Red's subtree on top (layer 100)]] + local hint = cell:NewText(hintText, 10, 8, 0, "DefaultDebugText") + hint:SetAnchor(0.0, 0.0); hint:SetRelativePoint(0.0, 0.0) + end + + -- ========================================================= + -- Test 9 [bot-right]: Two overlapping canvases + -- Cell 9 itself just contains an explanatory label. The actual + -- test is two extra non-RT canvases (CanvasA and CanvasB) created + -- at the top of CreateSortingDemo, BEFORE the main canvas - they overlap each other in + -- screen space within the bot-right cell area. + -- + -- Demonstrates that canvases render WHOLE one at a time: + -- everything in CanvasB draws above everything in CanvasA in the + -- overlap zone, never interleaved at the leaf level. (Once + -- per-canvas Z-index lands, that priority will determine which + -- canvas wins; today the first-created canvas draws on top.) 
+ -- ========================================================= + do + local cell = makeCell(2, 2, "9: Two Canvases (overlap)") + + local hintText = [[ +Two extra non-RT canvases overlap inside +this cell area. CanvasB was created first, +CanvasA second (first-created draws last). + +CanvasB's entire contents draw on top of +CanvasA in the overlap zone - nothing from +the two canvases interleaves at the leaves.]] + local hint = cell:NewText(hintText, 10, 8, 0, "DefaultDebugText") + hint:SetAnchor(0.0, 0.0); hint:SetRelativePoint(0.0, 0.0) + end + +end + local function CreateClippingDemo() local canvas = UI.GetCanvas("ClippingDemo", 0, 0, 1920, 1080); @@ -140,6 +542,7 @@ local function OnGameLoaded(eventID : number, data : any) --CreateClippingDemo(); --Demo(); + --CreateSortingDemo(); --CreateGameMenu(stack); --CreateOptionsMenu(stack); diff --git a/Source/Shaders/Shaders/Include/Lighting.inc.slang b/Source/Shaders/Shaders/Include/Lighting.inc.slang index 9e5ca247..591166cf 100644 --- a/Source/Shaders/Shaders/Include/Lighting.inc.slang +++ b/Source/Shaders/Shaders/Include/Lighting.inc.slang @@ -39,9 +39,6 @@ float3 ApplyLighting(float2 uv, float3 materialColor, PixelVertexData pixelVerte { DirectionalLight light = LoadDirectionalLight(i); - light.groundAmbientColor.rgb = float3(0.4f, 0.4f, 0.4f); - light.skyAmbientColor.rgb = float3(0.4f, 0.4f, 0.4f); - // Ambient Light float nDotUp = saturate(dot(pixelVertexData.worldNormal, float3(0.0f, 1.0f, 0.0f))); // Dot product between normal and up direction float4 lightAmbientColor = lerp(light.groundAmbientColor, light.skyAmbientColor, nDotUp); // Ambient color based on normal diff --git a/Source/Shaders/Shaders/Sorting/FFX_ParallelSort.inc.slang b/Source/Shaders/Shaders/Sorting/FFX_ParallelSort.inc.slang deleted file mode 100644 index 079c31d1..00000000 --- a/Source/Shaders/Shaders/Sorting/FFX_ParallelSort.inc.slang +++ /dev/null @@ -1,454 +0,0 @@ -// FFX_ParallelSort.h -// -// Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved. 
-// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// The above copyright notice and this permission notice shall be included in -// all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -// THE SOFTWARE. 
- -#define FFX_PARALLELSORT_SORT_BITS_PER_PASS 4 -#define FFX_PARALLELSORT_SORT_BIN_COUNT (1 << FFX_PARALLELSORT_SORT_BITS_PER_PASS) -#define FFX_PARALLELSORT_ELEMENTS_PER_THREAD 4 -#define FFX_PARALLELSORT_THREADGROUP_SIZE 128 - -////////////////////////////////////////////////////////////////////////// -// ParallelSort constant buffer parameters: -// -// NumKeys The number of keys to sort -// Shift How many bits to shift for this sort pass (we sort 4 bits at a time) -// NumBlocksPerThreadGroup How many blocks of keys each thread group needs to process -// NumThreadGroups How many thread groups are being run concurrently for sort -// NumThreadGroupsWithAdditionalBlocks How many thread groups need to process additional block data -// NumReduceThreadgroupPerBin How many thread groups are summed together for each reduced bin entry -// NumScanValues How many values to perform scan prefix (+ add) on -////////////////////////////////////////////////////////////////////////// - -struct FFX_ParallelSortCB -{ - uint NumKeys; - int NumBlocksPerThreadGroup; - uint NumThreadGroups; - uint NumThreadGroupsWithAdditionalBlocks; - uint NumReduceThreadgroupPerBin; - uint NumScanValues; -}; - -groupshared uint gs_Histogram[FFX_PARALLELSORT_THREADGROUP_SIZE * FFX_PARALLELSORT_SORT_BIN_COUNT]; -void FFX_ParallelSort_Count_uint(uint localID, uint groupID, FFX_ParallelSortCB CBuffer, uint ShiftBit, RWStructuredBuffer SrcBuffer, RWStructuredBuffer SumTable) -{ - // Start by clearing our local counts in LDS - for (int i = 0; i < FFX_PARALLELSORT_SORT_BIN_COUNT; i++) - gs_Histogram[(i * FFX_PARALLELSORT_THREADGROUP_SIZE) + localID] = 0; - - // Wait for everyone to catch up - GroupMemoryBarrierWithGroupSync(); - - // Data is processed in blocks, and how many we process can changed based on how much data we are processing - // versus how many thread groups we are processing with - int BlockSize = FFX_PARALLELSORT_ELEMENTS_PER_THREAD * FFX_PARALLELSORT_THREADGROUP_SIZE; - - // Figure out 
this thread group's index into the block data (taking into account thread groups that need to do extra reads) - uint ThreadgroupBlockStart = (BlockSize * CBuffer.NumBlocksPerThreadGroup * groupID); - uint NumBlocksToProcess = CBuffer.NumBlocksPerThreadGroup; - - if (groupID >= CBuffer.NumThreadGroups - CBuffer.NumThreadGroupsWithAdditionalBlocks) - { - ThreadgroupBlockStart += (groupID - (CBuffer.NumThreadGroups - CBuffer.NumThreadGroupsWithAdditionalBlocks)) * BlockSize; - NumBlocksToProcess++; - } - - // Get the block start index for this thread - uint BlockIndex = ThreadgroupBlockStart + localID; - - // Count value occurrence - for (uint BlockCount = 0; BlockCount < NumBlocksToProcess; BlockCount++, BlockIndex += BlockSize) - { - uint DataIndex = BlockIndex; - - // Pre-load the key values in order to hide some of the read latency - uint64_t srcKeys[FFX_PARALLELSORT_ELEMENTS_PER_THREAD]; - srcKeys[0] = SrcBuffer[DataIndex]; - srcKeys[1] = SrcBuffer[DataIndex + FFX_PARALLELSORT_THREADGROUP_SIZE]; - srcKeys[2] = SrcBuffer[DataIndex + (FFX_PARALLELSORT_THREADGROUP_SIZE * 2)]; - srcKeys[3] = SrcBuffer[DataIndex + (FFX_PARALLELSORT_THREADGROUP_SIZE * 3)]; - - for (uint i = 0; i < FFX_PARALLELSORT_ELEMENTS_PER_THREAD; i++) - { - if (DataIndex < CBuffer.NumKeys) - { - uint64_t localKey = (srcKeys[i] >> ShiftBit) & 0xf; - InterlockedAdd(gs_Histogram[(localKey * FFX_PARALLELSORT_THREADGROUP_SIZE) + localID], 1l); - DataIndex += FFX_PARALLELSORT_THREADGROUP_SIZE; - } - } - } - - // Even though our LDS layout guarantees no collisions, our thread group size is greater than a wave - // so we need to make sure all thread groups are done counting before we start tallying up the results - GroupMemoryBarrierWithGroupSync(); - - if (localID < FFX_PARALLELSORT_SORT_BIN_COUNT) - { - uint64_t sum = 0; - for (int i = 0; i < FFX_PARALLELSORT_THREADGROUP_SIZE; i++) - { - sum += gs_Histogram[localID * FFX_PARALLELSORT_THREADGROUP_SIZE + i]; - } - SumTable[localID * 
CBuffer.NumThreadGroups + groupID] = sum; - } -} - -groupshared uint64_t gs_LDSSums[FFX_PARALLELSORT_THREADGROUP_SIZE]; -uint64_t FFX_ParallelSort_ThreadgroupReduce(uint64_t localSum, uint localID) -{ - // Do wave local reduce - uint64_t waveReduced = WaveActiveSum(localSum); - - // First lane in a wave writes out wave reduction to LDS (this accounts for num waves per group greater than HW wave size) - // Note that some hardware with very small HW wave sizes (i.e. <= 8) may exhibit issues with this algorithm, and have not been tested. - uint waveID = localID / WaveGetLaneCount(); - if (WaveIsFirstLane()) - gs_LDSSums[waveID] = waveReduced; - - // Wait for everyone to catch up - GroupMemoryBarrierWithGroupSync(); - - // First wave worth of threads sum up wave reductions - if (!waveID) - waveReduced = WaveActiveSum( (localID < FFX_PARALLELSORT_THREADGROUP_SIZE / WaveGetLaneCount()) ? gs_LDSSums[localID] : 0); - - // Returned the reduced sum - return waveReduced; -} - -uint64_t FFX_ParallelSort_BlockScanPrefix(uint64_t localSum, uint localID) -{ - // Do wave local scan-prefix - uint64_t wavePrefixed = WavePrefixSum(localSum); - - // Since we are dealing with thread group sizes greater than HW wave size, we need to account for what wave we are in. 
- uint waveID = localID / WaveGetLaneCount(); - uint laneID = WaveGetLaneIndex(); - - // Last element in a wave writes out partial sum to LDS - if (laneID == WaveGetLaneCount() - 1) - gs_LDSSums[waveID] = wavePrefixed + localSum; - - // Wait for everyone to catch up - GroupMemoryBarrierWithGroupSync(); - - // First wave prefixes partial sums - if (!waveID) - gs_LDSSums[localID] = WavePrefixSum(gs_LDSSums[localID]); - - // Wait for everyone to catch up - GroupMemoryBarrierWithGroupSync(); - - // Add the partial sums back to each wave prefix - wavePrefixed += gs_LDSSums[waveID]; - - return wavePrefixed; -} - -void FFX_ParallelSort_ReduceCount(uint localID, uint groupID, FFX_ParallelSortCB CBuffer, RWStructuredBuffer SumTable, RWStructuredBuffer ReduceTable) -{ - // Figure out what bin data we are reducing - uint BinID = groupID / CBuffer.NumReduceThreadgroupPerBin; - uint BinOffset = BinID * CBuffer.NumThreadGroups; - - // Get the base index for this thread group - uint BaseIndex = (groupID % CBuffer.NumReduceThreadgroupPerBin) * FFX_PARALLELSORT_ELEMENTS_PER_THREAD * FFX_PARALLELSORT_THREADGROUP_SIZE; - - // Calculate partial sums for entries this thread reads in - uint64_t threadgroupSum = 0; - for (uint i = 0; i < FFX_PARALLELSORT_ELEMENTS_PER_THREAD; ++i) - { - uint DataIndex = BaseIndex + (i * FFX_PARALLELSORT_THREADGROUP_SIZE) + localID; - threadgroupSum += (DataIndex < CBuffer.NumThreadGroups) ? SumTable[BinOffset + DataIndex] : 0; - } - - // Reduce across the entirety of the thread group - threadgroupSum = FFX_ParallelSort_ThreadgroupReduce(threadgroupSum, localID); - - // First thread of the group writes out the reduced sum for the bin - if (!localID) - ReduceTable[groupID] = threadgroupSum; - - // What this will look like in the reduced table is: - // [ [bin0 ... bin0] [bin1 ... bin1] ... 
] -} - -// This is to transform uncoalesced loads into coalesced loads and -// then scattered loads from LDS -groupshared uint64_t gs_LDS[FFX_PARALLELSORT_ELEMENTS_PER_THREAD][FFX_PARALLELSORT_THREADGROUP_SIZE]; -void FFX_ParallelSort_ScanPrefix(uint numValuesToScan, uint localID, uint groupID, uint BinOffset, uint BaseIndex, bool AddPartialSums, - FFX_ParallelSortCB CBuffer, RWStructuredBuffer ScanSrc, RWStructuredBuffer ScanDst, RWStructuredBuffer ScanScratch) -{ - // Perform coalesced loads into LDS - { - for(uint i = 0; i < FFX_PARALLELSORT_ELEMENTS_PER_THREAD; i++) - { - uint DataIndex = BaseIndex + (i * FFX_PARALLELSORT_THREADGROUP_SIZE) + localID; - - uint col = ((i * FFX_PARALLELSORT_THREADGROUP_SIZE) + localID) / FFX_PARALLELSORT_ELEMENTS_PER_THREAD; - uint row = ((i * FFX_PARALLELSORT_THREADGROUP_SIZE) + localID) % FFX_PARALLELSORT_ELEMENTS_PER_THREAD; - gs_LDS[row][col] = (DataIndex < numValuesToScan) ? ScanSrc[BinOffset + DataIndex] : 0; - } - } - - // Wait for everyone to catch up - GroupMemoryBarrierWithGroupSync(); - - uint64_t threadgroupSum = 0; - // Calculate the local scan-prefix for current thread - { - for(uint i = 0; i < FFX_PARALLELSORT_ELEMENTS_PER_THREAD; i++) - { - uint64_t tmp = gs_LDS[i][localID]; - gs_LDS[i][localID] = threadgroupSum; - threadgroupSum += tmp; - } - } - - // Scan prefix partial sums - threadgroupSum = FFX_ParallelSort_BlockScanPrefix(threadgroupSum, localID); - - // Add reduced partial sums if requested - uint64_t partialSum = 0; - if (AddPartialSums) - { - // Partial sum additions are a little special as they are tailored to the optimal number of - // thread groups we ran in the beginning, so need to take that into account - partialSum = ScanScratch[groupID]; - } - - // Add the block scanned-prefixes back in - { - for(uint i = 0; i < FFX_PARALLELSORT_ELEMENTS_PER_THREAD; i++) - gs_LDS[i][localID] += threadgroupSum; - } - - // Wait for everyone to catch up - GroupMemoryBarrierWithGroupSync(); - - // Perform coalesced 
writes to scan dst - { - for(uint i = 0; i < FFX_PARALLELSORT_ELEMENTS_PER_THREAD; i++) - { - uint DataIndex = BaseIndex + (i * FFX_PARALLELSORT_THREADGROUP_SIZE) + localID; - - uint col = ((i * FFX_PARALLELSORT_THREADGROUP_SIZE) + localID) / FFX_PARALLELSORT_ELEMENTS_PER_THREAD; - uint row = ((i * FFX_PARALLELSORT_THREADGROUP_SIZE) + localID) % FFX_PARALLELSORT_ELEMENTS_PER_THREAD; - - if(DataIndex < numValuesToScan) - ScanDst[BinOffset + DataIndex] = gs_LDS[row][col] + partialSum; - } - } -} - -// Offset cache to avoid loading the offsets all the time -groupshared uint64_t gs_BinOffsetCache[FFX_PARALLELSORT_THREADGROUP_SIZE]; -// Local histogram for offset calculations -groupshared uint gs_LocalHistogram[FFX_PARALLELSORT_SORT_BIN_COUNT]; -// Scratch area for algorithm -groupshared uint64_t gs_LDSScratch[FFX_PARALLELSORT_THREADGROUP_SIZE]; -// LDS for uint sums -groupshared uint gs_LDSSums_uint[FFX_PARALLELSORT_THREADGROUP_SIZE]; -void FFX_ParallelSort_Scatter_uint(uint localID, uint groupID, FFX_ParallelSortCB CBuffer, uint ShiftBit, RWStructuredBuffer SrcBuffer, RWStructuredBuffer DstBuffer, - RWStructuredBuffer SumTable, RWStructuredBuffer SrcPayload, RWStructuredBuffer DstPayload) -{ - // Load the sort bin threadgroup offsets into LDS for faster referencing - if (localID < FFX_PARALLELSORT_SORT_BIN_COUNT) - gs_BinOffsetCache[localID] = SumTable[localID * CBuffer.NumThreadGroups + groupID]; - - // Wait for everyone to catch up - GroupMemoryBarrierWithGroupSync(); - - // Data is processed in blocks, and how many we process can changed based on how much data we are processing - // versus how many thread groups we are processing with - int BlockSize = FFX_PARALLELSORT_ELEMENTS_PER_THREAD * FFX_PARALLELSORT_THREADGROUP_SIZE; - - // Figure out this thread group's index into the block data (taking into account thread groups that need to do extra reads) - uint ThreadgroupBlockStart = (BlockSize * CBuffer.NumBlocksPerThreadGroup * groupID); - uint NumBlocksToProcess = 
CBuffer.NumBlocksPerThreadGroup; - - if (groupID >= CBuffer.NumThreadGroups - CBuffer.NumThreadGroupsWithAdditionalBlocks) - { - ThreadgroupBlockStart += (groupID - (CBuffer.NumThreadGroups - CBuffer.NumThreadGroupsWithAdditionalBlocks)) * BlockSize; - NumBlocksToProcess++; - } - - // Get the block start index for this thread - uint BlockIndex = ThreadgroupBlockStart + localID; - - // Count value occurences - uint newCount; - for (int BlockCount = 0; BlockCount < NumBlocksToProcess; BlockCount++, BlockIndex += BlockSize) - { - uint DataIndex = BlockIndex; - - // Pre-load the key values in order to hide some of the read latency - uint64_t srcKeys[FFX_PARALLELSORT_ELEMENTS_PER_THREAD]; - srcKeys[0] = SrcBuffer[DataIndex]; - srcKeys[1] = SrcBuffer[DataIndex + FFX_PARALLELSORT_THREADGROUP_SIZE]; - srcKeys[2] = SrcBuffer[DataIndex + (FFX_PARALLELSORT_THREADGROUP_SIZE * 2)]; - srcKeys[3] = SrcBuffer[DataIndex + (FFX_PARALLELSORT_THREADGROUP_SIZE * 3)]; - - uint srcValues[FFX_PARALLELSORT_ELEMENTS_PER_THREAD]; - srcValues[0] = SrcPayload[DataIndex]; - srcValues[1] = SrcPayload[DataIndex + FFX_PARALLELSORT_THREADGROUP_SIZE]; - srcValues[2] = SrcPayload[DataIndex + (FFX_PARALLELSORT_THREADGROUP_SIZE * 2)]; - srcValues[3] = SrcPayload[DataIndex + (FFX_PARALLELSORT_THREADGROUP_SIZE * 3)]; - - for (int i = 0; i < FFX_PARALLELSORT_ELEMENTS_PER_THREAD; i++) - { - // Clear the local histogram - if (localID < FFX_PARALLELSORT_SORT_BIN_COUNT) - gs_LocalHistogram[localID] = 0; - - uint64_t localKey = (DataIndex < CBuffer.NumKeys ? srcKeys[i] : 0xffffffffffffffff); - uint localValue = (DataIndex < CBuffer.NumKeys ? 
srcValues[i] : 0); - - // Sort the keys locally in LDS - for (uint bitShift = 0; bitShift < FFX_PARALLELSORT_SORT_BITS_PER_PASS; bitShift += 2) - { - // Figure out the keyIndex - uint64_t keyIndex = (localKey >> ShiftBit) & 0xf; - uint64_t bitKey = (keyIndex >> bitShift) & 0x3; - - // Create a packed histogram - uint64_t packedHistogram = (uint64_t)1 << (bitKey * 8); - - // Sum up all the packed keys (generates counted offsets up to current thread group) - uint64_t localSum = FFX_ParallelSort_BlockScanPrefix(packedHistogram, localID); - - // Last thread stores the updated histogram counts for the thread group - // Scratch = 0xsum3|sum2|sum1|sum0 for thread group - if (localID == (FFX_PARALLELSORT_THREADGROUP_SIZE - 1)) - gs_LDSScratch[0] = localSum + packedHistogram; - - // Wait for everyone to catch up - GroupMemoryBarrierWithGroupSync(); - - // Load the sums value for the thread group - packedHistogram = gs_LDSScratch[0]; - - // Add prefix offsets for all 4 bit "keys" (packedHistogram = 0xsum2_1_0|sum1_0|sum0|0) - packedHistogram = (packedHistogram << 8) + (packedHistogram << 16) + (packedHistogram << 24); - - // Calculate the proper offset for this thread's value - localSum += packedHistogram; - - // Calculate target offset - uint64_t keyOffset = (localSum >> (bitKey * 8)) & 0xff; - - // Re-arrange the keys (store, sync, load) - gs_LDSSums[keyOffset] = localKey; - GroupMemoryBarrierWithGroupSync(); - localKey = gs_LDSSums[localID]; - - // Wait for everyone to catch up - GroupMemoryBarrierWithGroupSync(); - - // Re-arrange the values if we have them (store, sync, load) - gs_LDSSums_uint[keyOffset] = localValue; - GroupMemoryBarrierWithGroupSync(); - localValue = gs_LDSSums_uint[localID]; - - // Wait for everyone to catch up - GroupMemoryBarrierWithGroupSync(); - } - - // Need to recalculate the keyIndex on this thread now that values have been copied around the thread group - uint64_t keyIndex = (localKey >> ShiftBit) & 0xf; - - // Reconstruct histogram - 
InterlockedAdd(gs_LocalHistogram[keyIndex], 1); - - // Wait for everyone to catch up - GroupMemoryBarrierWithGroupSync(); - - // Prefix histogram - uint histogramPrefixSum = WavePrefixSum(localID < FFX_PARALLELSORT_SORT_BIN_COUNT ? gs_LocalHistogram[localID] : 0); - - // Broadcast prefix-sum via LDS - if (localID < FFX_PARALLELSORT_SORT_BIN_COUNT) - gs_LDSScratch[localID] = histogramPrefixSum; - - // Get the global offset for this key out of the cache - uint64_t globalOffset = gs_BinOffsetCache[keyIndex]; - - // Wait for everyone to catch up - GroupMemoryBarrierWithGroupSync(); - - // Get the local offset (at this point the keys are all in increasing order from 0 -> num bins in localID 0 -> thread group size) - uint64_t localOffset = localID - gs_LDSScratch[keyIndex]; - - // Write to destination - uint totalOffset = uint(globalOffset + localOffset); - - if (totalOffset < CBuffer.NumKeys) - { - DstBuffer[totalOffset] = localKey; - DstPayload[totalOffset] = localValue; - } - - // Wait for everyone to catch up - GroupMemoryBarrierWithGroupSync(); - - // Update the cached histogram for the next set of entries - if (localID < FFX_PARALLELSORT_SORT_BIN_COUNT) - gs_BinOffsetCache[localID] += gs_LocalHistogram[localID]; - - DataIndex += FFX_PARALLELSORT_THREADGROUP_SIZE; // Increase the data offset by thread group size - } - } -} - -void FFX_ParallelSort_SetupIndirectParams(uint NumKeys, uint MaxThreadGroups, RWStructuredBuffer CBuffer, RWStructuredBuffer CountScatterArgs, RWStructuredBuffer ReduceScanArgs) -{ - CBuffer[0].NumKeys = NumKeys; - - uint BlockSize = FFX_PARALLELSORT_ELEMENTS_PER_THREAD * FFX_PARALLELSORT_THREADGROUP_SIZE; - uint NumBlocks = (NumKeys + BlockSize - 1) / BlockSize; - - // Figure out data distribution - uint NumThreadGroupsToRun = MaxThreadGroups; - uint BlocksPerThreadGroup = (NumBlocks / NumThreadGroupsToRun); - CBuffer[0].NumThreadGroupsWithAdditionalBlocks = NumBlocks % NumThreadGroupsToRun; - - if (NumBlocks < NumThreadGroupsToRun) - { - 
BlocksPerThreadGroup = 1; - NumThreadGroupsToRun = NumBlocks; - CBuffer[0].NumThreadGroupsWithAdditionalBlocks = 0; - } - - CBuffer[0].NumThreadGroups = NumThreadGroupsToRun; - CBuffer[0].NumBlocksPerThreadGroup = BlocksPerThreadGroup; - - // Calculate the number of thread groups to run for reduction (each thread group can process BlockSize number of entries) - uint NumReducedThreadGroupsToRun = FFX_PARALLELSORT_SORT_BIN_COUNT * ((BlockSize > NumThreadGroupsToRun) ? 1 : (NumThreadGroupsToRun + BlockSize - 1) / BlockSize); - CBuffer[0].NumReduceThreadgroupPerBin = NumReducedThreadGroupsToRun / FFX_PARALLELSORT_SORT_BIN_COUNT; - CBuffer[0].NumScanValues = NumReducedThreadGroupsToRun; // The number of reduce thread groups becomes our scan count (as each thread group writes out 1 value that needs scan prefix) - - // Setup dispatch arguments - CountScatterArgs[0] = NumThreadGroupsToRun; - CountScatterArgs[1] = 1; - CountScatterArgs[2] = 1; - - ReduceScanArgs[0] = NumReducedThreadGroupsToRun; - ReduceScanArgs[1] = 1; - ReduceScanArgs[2] = 1; -} diff --git a/Source/Shaders/Shaders/Sorting/Radix/Constants.inc.slang b/Source/Shaders/Shaders/Sorting/Radix/Constants.inc.slang new file mode 100644 index 00000000..475c0d80 --- /dev/null +++ b/Source/Shaders/Shaders/Sorting/Radix/Constants.inc.slang @@ -0,0 +1,7 @@ +#pragma once + +static const uint RADIX = 256; +static const uint WORKGROUP_SIZE = 512; +static const uint PARTITION_DIVISION = 8; +static const uint PARTITION_SIZE = PARTITION_DIVISION * WORKGROUP_SIZE; +static const uint MAX_SUBGROUP_SIZE = 128; diff --git a/Source/Shaders/Shaders/Sorting/Radix/Downsweep.cs.slang b/Source/Shaders/Shaders/Sorting/Radix/Downsweep.cs.slang new file mode 100644 index 00000000..149cc652 --- /dev/null +++ b/Source/Shaders/Shaders/Sorting/Radix/Downsweep.cs.slang @@ -0,0 +1,235 @@ +permutation KEY_VALUE = [0, 1]; +// Reduce-then-scan radix sort -- downsweep pass. 
+// Consumes the scanned histograms to scatter keys (and values, if KEY_VALUE=1) into their +// globally sorted positions. Dispatched with `(partitionCount, 1, 1)` groups, WORKGROUP_SIZE +// threads each. + +#include "Sorting/Radix/Constants.inc.slang" + +struct RadixPushConstants +{ + uint pass; + uint elementCount; +}; +[[vk::push_constant]] RadixPushConstants _radixPC; + +[[vk::binding(1, PER_PASS)]] RWStructuredBuffer globalHistogram; +[[vk::binding(2, PER_PASS)]] RWStructuredBuffer partitionHistogram; +[[vk::binding(3, PER_PASS)]] RWStructuredBuffer keysIn; +[[vk::binding(4, PER_PASS)]] RWStructuredBuffer keysOut; +#if KEY_VALUE +[[vk::binding(5, PER_PASS)]] RWStructuredBuffer valuesIn; +[[vk::binding(6, PER_PASS)]] RWStructuredBuffer valuesOut; +#endif + +groupshared uint localHistogram[PARTITION_SIZE]; // (R, S=16)=4096, (P) for alias. take maximum. +groupshared uint localHistogramSum[RADIX]; + +// returns 0b00000....11111, where msb is id-1. +uint4 GetExclusiveWaveMask(uint id) +{ + // clamp bit-shift right operand between 0..31 to avoid undefined behavior. + uint shift = (1 << bitfieldExtract(id, 0, 5)) - 1; // (1 << (id % 32)) - 1 + // right shift operation on signed integer copies sign bit, use the trick for masking. 
+ // (negative) >> 31 = 111...111 + // (non-negative) >> 31 = 000...000 + int x = int(id) >> 5; + return uint4((shift & ((-1 - x) >> 31)) | ((0 - x) >> 31), // + (shift & ((0 - x) >> 31)) | ((1 - x) >> 31), // + (shift & ((1 - x) >> 31)) | ((2 - x) >> 31), // + (shift & ((2 - x) >> 31)) | ((3 - x) >> 31)); +} + +uint GetBitCount(uint4 value) +{ + uint4 result = countbits(value); + return result[0] + result[1] + result[2] + result[3]; +} + +[shader("compute")] +[numthreads(WORKGROUP_SIZE)] +void main(uint3 groupThreadID: SV_GroupThreadID, uint3 groupId: SV_GroupID, + uint groupIndex: SV_GroupIndex) +{ + const uint pass = _radixPC.pass; + const uint elementCount = _radixPC.elementCount; + + uint laneIndex = WaveGetLaneIndex(); // 0..31 or 0..63 + uint laneCount = WaveGetLaneCount(); // 32 or 64 + uint waveIndex = groupIndex / laneCount; // 0..15 or 0..7 + uint waveCount = WORKGROUP_SIZE / laneCount; // 16 or 8 + uint index = waveIndex * laneCount + laneIndex; + + uint4 waveMask = GetExclusiveWaveMask(laneIndex); + + uint partitionIndex = groupId.x; + uint partitionStart = partitionIndex * PARTITION_SIZE; + + if (partitionStart >= elementCount) + return; + + if (index < RADIX) { + for (int i = 0; i < waveCount; ++i) { + localHistogram[waveCount * index + i] = 0; + } + } + GroupMemoryBarrierWithGroupSync(); + + // load from global memory, local histogram and offset + uint localKeys[PARTITION_DIVISION]; + uint localRadix[PARTITION_DIVISION]; + uint localOffsets[PARTITION_DIVISION]; + uint waveHistogram[PARTITION_DIVISION]; +#if KEY_VALUE + uint localValues[PARTITION_DIVISION]; +#endif + + [ForceUnroll] + for (int i = 0; i < PARTITION_DIVISION; ++i) { + uint keyIndex = + partitionStart + (PARTITION_DIVISION * laneCount) * waveIndex + i * laneCount + laneIndex; + uint key = keyIndex < elementCount ? keysIn[keyIndex] : 0xffffffff; + localKeys[i] = key; + +#if KEY_VALUE + localValues[i] = keyIndex < elementCount ? 
valuesIn[keyIndex] : 0; +#endif + + uint radix = bitfieldExtract(key, pass * 8, 8); + localRadix[i] = radix; + + // mask per digit + uint4 mask = WaveActiveBallot(true); + [ForceUnroll] + for (int j = 0; j < 8; ++j) { + uint digit = (radix >> j) & 1; + uint4 ballot = WaveActiveBallot(digit == 1); + // digit - 1 is 0 or 0xffffffff. xor to flip. + mask &= uint4(digit - 1) ^ ballot; + } + + // wave level offset for radix + uint waveOffset = GetBitCount(waveMask & mask); + uint radixCount = GetBitCount(mask); + + // elect a representative per radix, add to histogram + if (waveOffset == 0) { + // accumulate to local histogram + __atomic_add(localHistogram[waveCount * radix + waveIndex], radixCount, MemoryOrder.Relaxed); + waveHistogram[i] = radixCount; + } else { + waveHistogram[i] = 0; + } + + localOffsets[i] = waveOffset; + } + GroupMemoryBarrierWithGroupSync(); + + // local histogram reduce 4096 or 2048 + for (uint i = index; i < RADIX * waveCount; i += WORKGROUP_SIZE) { + uint v = localHistogram[i]; + uint sum = WaveActiveSum(v); + uint excl = WavePrefixSum(v); + localHistogram[i] = excl; + if (laneIndex == 0) { + localHistogramSum[i / laneCount] = sum; + } + } + GroupMemoryBarrierWithGroupSync(); + + // local histogram reduce 128 or 32 + uint intermediateOffset0 = RADIX * waveCount / laneCount; + if (index < intermediateOffset0) { + uint v = localHistogramSum[index]; + uint sum = WaveActiveSum(v); + uint excl = WavePrefixSum(v); + localHistogramSum[index] = excl; + if (laneIndex == 0) { + localHistogramSum[intermediateOffset0 + index / laneCount] = sum; + } + } + GroupMemoryBarrierWithGroupSync(); + + // local histogram reduce 4 or 1 + uint intermediateSize1 = max(RADIX * waveCount / laneCount / laneCount, 1); + if (index < intermediateSize1) { + uint v = localHistogramSum[intermediateOffset0 + index]; + uint excl = WavePrefixSum(v); + localHistogramSum[intermediateOffset0 + index] = excl; + } + GroupMemoryBarrierWithGroupSync(); + + // local histogram add 128 + if 
(index < intermediateOffset0) { + localHistogramSum[index] += localHistogramSum[intermediateOffset0 + index / laneCount]; + } + GroupMemoryBarrierWithGroupSync(); + + // local histogram add 4096 + for (uint i = index; i < RADIX * waveCount; i += WORKGROUP_SIZE) { + localHistogram[i] += localHistogramSum[i / laneCount]; + } + GroupMemoryBarrierWithGroupSync(); + + // post-scan stage + [ForceUnroll] + for (int i = 0; i < PARTITION_DIVISION; ++i) { + uint radix = localRadix[i]; + localOffsets[i] += localHistogram[waveCount * radix + waveIndex]; + + GroupMemoryBarrierWithGroupSync(); + if (waveHistogram[i] > 0) { + __atomic_add(localHistogram[waveCount * radix + waveIndex], waveHistogram[i], + MemoryOrder.Relaxed); + } + GroupMemoryBarrierWithGroupSync(); + } + + // after atomicAdd, localHistogram contains inclusive sum + if (index < RADIX) { + uint v = index == 0 ? 0 : localHistogram[waveCount * index - 1]; + localHistogramSum[index] = globalHistogram[RADIX * pass + index] + + partitionHistogram[RADIX * partitionIndex + index] - v; + } + GroupMemoryBarrierWithGroupSync(); + + // rearrange keys. grouping keys together makes dstOffset to be almost sequential, grants huge + // speed boost. now localHistogram is unused, so alias memory. 
+ [ForceUnroll] + for (int i = 0; i < PARTITION_DIVISION; ++i) { + localHistogram[localOffsets[i]] = localKeys[i]; + } + GroupMemoryBarrierWithGroupSync(); + + // binning + for (uint i = index; i < PARTITION_SIZE; i += WORKGROUP_SIZE) { + uint key = localHistogram[i]; + uint radix = bitfieldExtract(key, pass * 8, 8); + uint dstOffset = localHistogramSum[radix] + i; + if (dstOffset < elementCount) { + keysOut[dstOffset] = key; + } + +#if KEY_VALUE + localKeys[i / WORKGROUP_SIZE] = dstOffset; +#endif + } + +#if KEY_VALUE + GroupMemoryBarrierWithGroupSync(); + + [ForceUnroll] + for (int i = 0; i < PARTITION_DIVISION; ++i) { + localHistogram[localOffsets[i]] = localValues[i]; + } + GroupMemoryBarrierWithGroupSync(); + + for (uint i = index; i < PARTITION_SIZE; i += WORKGROUP_SIZE) { + uint value = localHistogram[i]; + uint dstOffset = localKeys[i / WORKGROUP_SIZE]; + if (dstOffset < elementCount) { + valuesOut[dstOffset] = value; + } + } +#endif +} diff --git a/Source/Shaders/Shaders/Sorting/Radix/Spine.cs.slang b/Source/Shaders/Shaders/Sorting/Radix/Spine.cs.slang new file mode 100644 index 00000000..5ac0bd3b --- /dev/null +++ b/Source/Shaders/Shaders/Sorting/Radix/Spine.cs.slang @@ -0,0 +1,97 @@ +// Reduce-then-scan radix sort -- spine pass. +// For each radix bin, prefix-scans the per-partition histograms so every partition knows its +// starting offset for every bin. Also prefix-scans the per-pass global histogram (bin 0's +// workgroup handles that). +// Dispatched with `(RADIX=256, 1, 1)` groups, WORKGROUP_SIZE threads each. 
+ +#include "Sorting/Radix/Constants.inc.slang" + +struct RadixPushConstants +{ + uint pass; + uint elementCount; +}; +[[vk::push_constant]] RadixPushConstants _radixPC; + +[[vk::binding(1, PER_PASS)]] RWStructuredBuffer globalHistogram; +[[vk::binding(2, PER_PASS)]] RWStructuredBuffer partitionHistogram; + +groupshared uint reduction; +groupshared uint intermediate[MAX_SUBGROUP_SIZE]; + +[shader("compute")] +[numthreads(WORKGROUP_SIZE)] +void main(uint3 groupThreadID: SV_GroupThreadID, uint3 groupId: SV_GroupID, + uint groupIndex: SV_GroupIndex) +{ + const uint pass = _radixPC.pass; + const uint elementCount = _radixPC.elementCount; + + uint laneIndex = WaveGetLaneIndex(); // 0..31 + uint laneCount = WaveGetLaneCount(); // 32 + uint waveIndex = groupIndex / laneCount; + uint waveCount = WORKGROUP_SIZE / laneCount; + uint index = waveIndex * laneCount + laneIndex; + + uint radix = groupId.x; + + uint partitionCount = (elementCount + PARTITION_SIZE - 1) / PARTITION_SIZE; + + if (index == 0) { + reduction = 0; + } + GroupMemoryBarrierWithGroupSync(); + + for (uint i = 0; WORKGROUP_SIZE * i < partitionCount; ++i) { + uint partitionIndex = WORKGROUP_SIZE * i + index; + uint value = + partitionIndex < partitionCount ? 
partitionHistogram[RADIX * partitionIndex + radix] : 0; + uint excl = WavePrefixSum(value) + reduction; + uint sum = WaveActiveSum(value); + + if (WaveIsFirstLane()) { + intermediate[waveIndex] = sum; + } + GroupMemoryBarrierWithGroupSync(); + + if (index < waveCount) { + uint excl = WavePrefixSum(intermediate[index]); + uint sum = WaveActiveSum(intermediate[index]); + intermediate[index] = excl; + + if (index == 0) { + reduction += sum; + } + } + GroupMemoryBarrierWithGroupSync(); + + if (partitionIndex < partitionCount) { + excl += intermediate[waveIndex]; + partitionHistogram[RADIX * partitionIndex + radix] = excl; + } + GroupMemoryBarrierWithGroupSync(); + } + + if (radix == 0) { + // one workgroup is responsible for global histogram prefix sum + if (index < RADIX) { + uint value = globalHistogram[RADIX * pass + index]; + uint excl = WavePrefixSum(value); + uint sum = WaveActiveSum(value); + + if (WaveIsFirstLane()) { + intermediate[waveIndex] = sum; + } + GroupMemoryBarrierWithGroupSync(); + + if (index < RADIX / laneCount) { + uint excl = WavePrefixSum(intermediate[index]); + intermediate[index] = excl; + } + GroupMemoryBarrierWithGroupSync(); + + excl += intermediate[waveIndex]; + globalHistogram[RADIX * pass + index] = excl; + } + } +} diff --git a/Source/Shaders/Shaders/Sorting/Radix/Upsweep.cs.slang b/Source/Shaders/Shaders/Sorting/Radix/Upsweep.cs.slang new file mode 100644 index 00000000..ec38a4c5 --- /dev/null +++ b/Source/Shaders/Shaders/Sorting/Radix/Upsweep.cs.slang @@ -0,0 +1,59 @@ +// Reduce-then-scan radix sort -- upsweep pass. +// For each partition of PARTITION_SIZE keys, counts per-radix-bin histograms: +// - Writes the per-partition histogram to partitionHistogram[RADIX * partitionIndex + radix] (partition-major) +// - Accumulates the per-pass global histogram via atomic add +// Dispatched with `(partitionCount, 1, 1)` groups, WORKGROUP_SIZE threads each. 
+ +#include "Sorting/Radix/Constants.inc.slang" + +struct RadixPushConstants +{ + uint pass; // 0..3 for 4 passes of 8 bits over a u32 key + uint elementCount; // total valid keys (the last partition may be partial) +}; +[[vk::push_constant]] RadixPushConstants _radixPC; + +[[vk::binding(1, PER_PASS)]] RWStructuredBuffer globalHistogram; +[[vk::binding(2, PER_PASS)]] RWStructuredBuffer partitionHistogram; +[[vk::binding(3, PER_PASS)]] RWStructuredBuffer keys; + +groupshared uint localHistogram[RADIX]; + +[shader("compute")] +[numthreads(WORKGROUP_SIZE)] +void main(uint3 groupThreadID: SV_GroupThreadID, uint3 groupId: SV_GroupID) +{ + const uint pass = _radixPC.pass; + const uint elementCount = _radixPC.elementCount; + + uint index = groupThreadID.x; + uint partitionIndex = groupId.x; + uint partitionStart = partitionIndex * PARTITION_SIZE; + + // discard all workgroup invocations + if (partitionStart >= elementCount) { + return; + } + + if (index < RADIX) { + localHistogram[index] = 0; + } + GroupMemoryBarrierWithGroupSync(); + + // local histogram + for (int i = 0; i < PARTITION_DIVISION; ++i) { + uint keyIndex = partitionStart + WORKGROUP_SIZE * i + index; + uint key = keyIndex < elementCount ? keys[keyIndex] : 0xffffffff; + uint radix = bitfieldExtract(key, 8 * pass, 8); + __atomic_add(localHistogram[radix], 1, MemoryOrder.Relaxed); + } + GroupMemoryBarrierWithGroupSync(); + + if (index < RADIX) { + // set to partition histogram + partitionHistogram[RADIX * partitionIndex + index] = localHistogram[index]; + + // add to global histogram + __atomic_add(globalHistogram[RADIX * pass + index], localHistogram[index], MemoryOrder.Relaxed); + } +} diff --git a/Source/Shaders/Shaders/Sorting/SortCount.cs.slang b/Source/Shaders/Shaders/Sorting/SortCount.cs.slang deleted file mode 100644 index a2ae2ce1..00000000 --- a/Source/Shaders/Shaders/Sorting/SortCount.cs.slang +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved. 
-// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// The above copyright notice and this permission notice shall be included in -// all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -// THE SOFTWARE. 
- -//-------------------------------------------------------------------------------------- -// ParallelSort Shaders/Includes -//-------------------------------------------------------------------------------------- -#include "Sorting/FFX_ParallelSort.inc.slang" - -struct CountCB -{ - uint shiftBits; -}; - -[[vk::push_constant]] CountCB _countCB; // Count Indirect Constant buffer - -[[vk::binding(0, PER_PASS)]] ConstantBuffer _constants; // Constant Buffer -[[vk::binding(1, PER_PASS)]] RWStructuredBuffer _keys; // The unsorted keys or scan data -[[vk::binding(2, PER_PASS)]] RWStructuredBuffer _sumTable; // The sum table we will write sums to - -[shader("compute")] -[numthreads(FFX_PARALLELSORT_THREADGROUP_SIZE, 1, 1)] -void main(uint localID : SV_GroupThreadID, uint groupID : SV_GroupID) -{ - // Call the uint version of the count part of the algorithm - FFX_ParallelSort_Count_uint(localID, groupID, _constants, _countCB.shiftBits, _keys, _sumTable); -} diff --git a/Source/Shaders/Shaders/Sorting/SortCountReduce.cs.slang b/Source/Shaders/Shaders/Sorting/SortCountReduce.cs.slang deleted file mode 100644 index ff5e797e..00000000 --- a/Source/Shaders/Shaders/Sorting/SortCountReduce.cs.slang +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// The above copyright notice and this permission notice shall be included in -// all copies or substantial portions of the Software. 
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -// THE SOFTWARE. - -//-------------------------------------------------------------------------------------- -// ParallelSort Shaders/Includes -//-------------------------------------------------------------------------------------- -#include "Sorting/FFX_ParallelSort.inc.slang" - -[[vk::binding(0, PER_PASS)]] ConstantBuffer _constants; // Constant Buffer -[[vk::binding(1, PER_PASS)]] RWStructuredBuffer _sumTable; // The sum table we will write sums to -[[vk::binding(2, PER_PASS)]] RWStructuredBuffer _reducedSumTable; // The reduced sum table we will write sums to - -[shader("compute")] -[numthreads(FFX_PARALLELSORT_THREADGROUP_SIZE, 1, 1)] -void main(uint localID : SV_GroupThreadID, uint groupID : SV_GroupID) -{ - // Call the reduce part of the algorithm - FFX_ParallelSort_ReduceCount(localID, groupID, _constants, _sumTable, _reducedSumTable); -} diff --git a/Source/Shaders/Shaders/Sorting/SortScan.cs.slang b/Source/Shaders/Shaders/Sorting/SortScan.cs.slang deleted file mode 100644 index baf962e8..00000000 --- a/Source/Shaders/Shaders/Sorting/SortScan.cs.slang +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved. 
-// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// The above copyright notice and this permission notice shall be included in -// all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -// THE SOFTWARE. 
- -//-------------------------------------------------------------------------------------- -// ParallelSort Shaders/Includes -//-------------------------------------------------------------------------------------- -#include "Sorting/FFX_ParallelSort.inc.slang" - -[[vk::binding(0, PER_PASS)]] ConstantBuffer _constants; // Constant Buffer -[[vk::binding(1, PER_PASS)]] RWStructuredBuffer _scanSrc; // Source for Scan Data -[[vk::binding(2, PER_PASS)]] RWStructuredBuffer _scanDst; // Destination for Scan Data -[[vk::binding(3, PER_PASS)]] RWStructuredBuffer _scanScratch; // Scratch data for Scan - -[shader("compute")] -[numthreads(FFX_PARALLELSORT_THREADGROUP_SIZE, 1, 1)] -void main(uint localID : SV_GroupThreadID, uint groupID : SV_GroupID) -{ - uint baseIndex = FFX_PARALLELSORT_ELEMENTS_PER_THREAD * FFX_PARALLELSORT_THREADGROUP_SIZE * groupID; - FFX_ParallelSort_ScanPrefix(_constants.NumScanValues, localID, groupID, 0, baseIndex, false, - _constants, _scanSrc, _scanDst, _scanScratch); -} \ No newline at end of file diff --git a/Source/Shaders/Shaders/Sorting/SortScanAdd.cs.slang b/Source/Shaders/Shaders/Sorting/SortScanAdd.cs.slang deleted file mode 100644 index f0242849..00000000 --- a/Source/Shaders/Shaders/Sorting/SortScanAdd.cs.slang +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// The above copyright notice and this permission notice shall be included in -// all copies or substantial portions of the Software. 
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -// THE SOFTWARE. - -//-------------------------------------------------------------------------------------- -// ParallelSort Shaders/Includes -//-------------------------------------------------------------------------------------- -#include "Sorting/FFX_ParallelSort.inc.slang" - -[[vk::binding(0, PER_PASS)]] ConstantBuffer _constants; // Constant Buffer -[[vk::binding(1, PER_PASS)]] RWStructuredBuffer _scanSrc; // Source for Scan Data -[[vk::binding(2, PER_PASS)]] RWStructuredBuffer _scanDst; // Destination for Scan Data -[[vk::binding(3, PER_PASS)]] RWStructuredBuffer _scanScratch; // Scratch data for Scan - -[shader("compute")] -[numthreads(FFX_PARALLELSORT_THREADGROUP_SIZE, 1, 1)] -void main(uint localID : SV_GroupThreadID, uint groupID : SV_GroupID) -{ - // When doing adds, we need to access data differently because reduce - // has a more specialized access pattern to match optimized count - // Access needs to be done similarly to reduce - // Figure out what bin data we are reducing - uint binID = groupID / _constants.NumReduceThreadgroupPerBin; - uint binOffset = binID * _constants.NumThreadGroups; - - // Get the base index for this thread group - //uint BaseIndex = FFX_PARALLELSORT_ELEMENTS_PER_THREAD * FFX_PARALLELSORT_THREADGROUP_SIZE * (groupID / FFX_PARALLELSORT_SORT_BIN_COUNT); - uint baseIndex = (groupID % _constants.NumReduceThreadgroupPerBin) * FFX_PARALLELSORT_ELEMENTS_PER_THREAD * FFX_PARALLELSORT_THREADGROUP_SIZE; - - FFX_ParallelSort_ScanPrefix(_constants.NumThreadGroups, localID, 
groupID, binOffset, baseIndex, true, - _constants, _scanSrc, _scanDst, _scanScratch); -} \ No newline at end of file diff --git a/Source/Shaders/Shaders/Sorting/SortScatter.cs.slang b/Source/Shaders/Shaders/Sorting/SortScatter.cs.slang deleted file mode 100644 index 97446ffe..00000000 --- a/Source/Shaders/Shaders/Sorting/SortScatter.cs.slang +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// The above copyright notice and this permission notice shall be included in -// all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -// THE SOFTWARE. 
- -//-------------------------------------------------------------------------------------- -// ParallelSort Shaders/Includes -//-------------------------------------------------------------------------------------- -#include "Sorting/FFX_ParallelSort.inc.slang" - -struct ScatterCB -{ - uint shiftBits; -}; - -[[vk::push_constant]] ScatterCB _scatterCB; // Count Indirect Constant buffer - -[[vk::binding(0, PER_PASS)]] ConstantBuffer _constants; // Constant Buffer -[[vk::binding(1, PER_PASS)]] RWStructuredBuffer _keys; // The unsorted keys or scan data -[[vk::binding(2, PER_PASS)]] RWStructuredBuffer _values; // The payload data -[[vk::binding(3, PER_PASS)]] RWStructuredBuffer _sumTable; // The sum table we will write sums to -[[vk::binding(5, PER_PASS)]] RWStructuredBuffer _writeKeys; // The sorted keys or prefixed data -[[vk::binding(6, PER_PASS)]] RWStructuredBuffer _writeValues; // the sorted payload data - -[shader("compute")] -[numthreads(FFX_PARALLELSORT_THREADGROUP_SIZE, 1, 1)] -void main(uint localID : SV_GroupThreadID, uint groupID : SV_GroupID) -{ - FFX_ParallelSort_Scatter_uint(localID, groupID, _constants, _scatterCB.shiftBits, _keys, _writeKeys, _sumTable, _values, _writeValues); -} \ No newline at end of file diff --git a/Source/Shaders/Shaders/Sorting/SortSetupIndirectParameters.cs.slang b/Source/Shaders/Shaders/Sorting/SortSetupIndirectParameters.cs.slang deleted file mode 100644 index 302d46d3..00000000 --- a/Source/Shaders/Shaders/Sorting/SortSetupIndirectParameters.cs.slang +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved. 
-// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// The above copyright notice and this permission notice shall be included in -// all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -// THE SOFTWARE. 
- -//-------------------------------------------------------------------------------------- -// ParallelSort Shaders/Includes -//-------------------------------------------------------------------------------------- -#include "Sorting/FFX_ParallelSort.inc.slang" - -struct SetupCB -{ - uint maxThreadGroups; -}; - -struct NumKeys -{ - uint numKeys; -}; - -[[vk::push_constant]] SetupCB _setupCB; // Setup Indirect Constant buffer - -[[vk::binding(0, PER_PASS)]] StructuredBuffer _numKeys; -[[vk::binding(1, PER_PASS)]] RWStructuredBuffer _constants; // UAV for constant buffer parameters for indirect execution -[[vk::binding(2, PER_PASS)]] RWStructuredBuffer _countScatterArgs; // Count and Scatter Args for indirect execution -[[vk::binding(3, PER_PASS)]] RWStructuredBuffer _reduceScanArgs; // Reduce and Scan Args for indirect execution - -[shader("compute")] -[numthreads(1, 1, 1)] -void main(uint localID : SV_GroupThreadID) -{ - FFX_ParallelSort_SetupIndirectParams(_numKeys[0].numKeys, _setupCB.maxThreadGroups, _constants, _countScatterArgs, _reduceScanArgs); -} \ No newline at end of file diff --git a/Source/Shaders/Shaders/UI/Text.ps.slang b/Source/Shaders/Shaders/UI/Text.ps.slang deleted file mode 100644 index 3fa5ac3a..00000000 --- a/Source/Shaders/Shaders/UI/Text.ps.slang +++ /dev/null @@ -1,112 +0,0 @@ - -#include "Include/Common.inc.slang" - -struct CharDrawData -{ - uint4 packed0; // x: textureIndex & charIndex, y: clipMaskTextureIndex, z: textColor, w: borderColor - float4 packed1; // x: borderSize, y: padding, zw: unitRangeXY - uint4 packed2; // x: clipRegionMinXY, y: clipRegionMaxXY, z: clipMaskRegionMinXY, w: clipMaskRegionMaxXY - int4 packed3; // x: worldPositionIndex, yzw: unused -}; -[[vk::binding(2, PER_PASS)]] StructuredBuffer _charDrawDatas; - -[[vk::binding(3, PER_PASS)]] SamplerState _sampler; -[[vk::binding(4, PER_PASS)]] Texture2D _fontTextures[4096]; -[[vk::binding(5, PER_PASS)]] Texture2D _textures[4096]; - -struct VertexOutput -{ - float4 position 
: SV_POSITION; - float4 uvAndScreenPos : TEXCOORD0; - uint charDrawDataID : TEXCOORD1; -}; - -float Median(float a, float b, float c) -{ - return max(min(a, b), min(max(a, b), c)); -} - -float ScreenPxRange(float2 uv, float2 unitRange) -{ - float2 screenTexSize = float2(1.0f, 1.0f) / fwidth(uv); - return max(0.5 * dot(unitRange, screenTexSize), 1.0); -} - -bool ShouldDiscard(float2 pos, float2 clipMin, float2 clipMax) -{ - // Check if the position is outside the clip rect - return pos.x < clipMin.x || pos.x > clipMax.x || pos.y < clipMin.y || pos.y > clipMax.y; -} - -[shader("fragment")] -float4 main(VertexOutput input) : SV_Target -{ - //return float4(1.0f, 0.0f, 0.0f, 0.3f); - CharDrawData drawData = _charDrawDatas[input.charDrawDataID]; - - float2 screenPos = input.uvAndScreenPos.zw; - float2 clipRegionMin = float2(f16tof32(drawData.packed2.x), f16tof32(drawData.packed2.x >> 16)); - float2 clipRegionMax = float2(f16tof32(drawData.packed2.y), f16tof32(drawData.packed2.y >> 16)); - if (ShouldDiscard(screenPos, clipRegionMin, clipRegionMax)) - { - //return float4(1.0f, 0.0f, 0.0f, 0.3f); - discard; - } - - uint textureIndex = drawData.packed0.x & 0xFFFF; - - uint packedTextColor = drawData.packed0.z; - uint packedBorderColor = drawData.packed0.w; - - float4 textColor = PackedUnormsToFloat4(packedTextColor); - float4 borderColor = PackedUnormsToFloat4(packedBorderColor); - - float borderSize = drawData.packed1.x; - float2 unitRange = drawData.packed1.zw; - - float4 distances = _fontTextures[textureIndex].Sample(_sampler, input.uvAndScreenPos.xy).rgba; - - const float roundedInlines = 0.0f; - const float roundedOutlines = 1.0f; - const float outBias = 1.0 / 4.0; - - float distMsdf = Median(distances.r, distances.g, distances.b); - float distSdf = distances.a; // mtsdf format only - distMsdf = min(distMsdf, distSdf + 0.1f); // HACK: to fix glitch in msdf near edges, see https://www.redblobgames.com/x/2404-distance-field-effects/ - - // Blend between sharp and rounded 
corners - float distInner = lerp(distMsdf, distSdf, roundedInlines); - float distOuter = lerp(distMsdf, distSdf, roundedOutlines); - - // Typically 0.5 is the threshold, > 0.5 is inside, < 0.5 is outside - const float threshold = 0.5f; - float width = ScreenPxRange(input.uvAndScreenPos.xy, unitRange); - - float inner = width * (distInner - threshold) + 0.5f + outBias; - float outer = width * (distOuter - threshold) + 0.5f + outBias + borderSize; - - float innerOpacity = saturate(inner); - float4 innerColor = textColor; - float outerOpacity = saturate(outer); - float4 outerColor = float4(borderColor.rgb, 1.0f); - - float4 color = (innerColor * innerOpacity) + (outerColor * (outerOpacity - innerOpacity)); - - // Apply the clipMask - float2 clipMaskRegionMin = float2(f16tof32(drawData.packed2.z), f16tof32(drawData.packed2.z >> 16)); - float2 clipMaskRegionMax = float2(f16tof32(drawData.packed2.w), f16tof32(drawData.packed2.w >> 16)); - float2 maskUV = (screenPos - clipMaskRegionMin) / (clipMaskRegionMax - clipMaskRegionMin); - - uint clipMaskTextureIndex = drawData.packed0.y; - float clipMask = _textures[clipMaskTextureIndex].Sample(_sampler, maskUV).a; - if (clipMask < 0.5f) - { - discard; - } - color.a *= clipMask; - - // Multiply the color channels by alpha for pre-multiplied alpha output - color.rgb *= color.a; - - return saturate(color); -} \ No newline at end of file diff --git a/Source/Shaders/Shaders/UI/Text.vs.slang b/Source/Shaders/Shaders/UI/Text.vs.slang deleted file mode 100644 index dcd4a6d9..00000000 --- a/Source/Shaders/Shaders/UI/Text.vs.slang +++ /dev/null @@ -1,67 +0,0 @@ - -#include "DescriptorSet/Global.inc.slang" - -[[vk::binding(0, PER_PASS)]] StructuredBuffer _vertices; -[[vk::binding(1, PER_PASS)]] StructuredBuffer _widgetWorldPositions; - -struct CharDrawData -{ - uint4 packed0; // x: textureIndex & charIndex, y: clipMaskTextureIndex, z: textColor, w: borderColor - float4 packed1; // x: borderSize, y: padding, zw: unitRangeXY - uint4 packed2; 
// x: clipRegionMinXY, y: clipRegionMaxXY, z: clipMaskRegionMinXY, w: clipMaskRegionMaxXY - int4 packed3; // x: worldPositionIndex, yzw: unused -}; -[[vk::binding(2, PER_PASS)]] StructuredBuffer _charDrawDatas; - -struct VertexInput -{ - uint vertexID : SV_VulkanVertexID; - uint charDrawDataID : SV_VulkanInstanceID; -}; - -struct VertexOutput -{ - float4 position : SV_POSITION; - float4 uvAndScreenPos : TEXCOORD0; - uint charDrawDataID : TEXCOORD1; -}; - -[shader("vertex")] -VertexOutput main(VertexInput input) -{ - CharDrawData charDrawData = _charDrawDatas[input.charDrawDataID]; - - uint charIndex = charDrawData.packed0.x >> 16; - - uint vertexID = input.vertexID + (charIndex * 6); // 6 vertices per character - float4 vertex = _vertices[vertexID]; - - float2 position = vertex.xy; - float2 uv = vertex.zw; - - int worldPositionIndex = charDrawData.packed3.x; - float4 finalPos; - - if (worldPositionIndex >= 0) - { - float3 worldPos = _widgetWorldPositions[worldPositionIndex].xyz; - - // Transform the world position to clip space. - float4 clipPos = mul(float4(worldPos, 1.0), _cameras[0].worldToClip); - clipPos.xyz /= clipPos.w; // Perform perspective division. 
- - finalPos = float4(clipPos.xy + position, 0.0, 1.0); - } - else - { - finalPos = float4(position, 0.0, 1.0); - } - - VertexOutput output; - output.position = finalPos; - float2 screenPos = (finalPos.xy + 1.0f) * 0.5f; - output.uvAndScreenPos = float4(uv, screenPos); - output.charDrawDataID = input.charDrawDataID; - - return output; -} \ No newline at end of file diff --git a/Source/Shaders/Shaders/UI/Panel.ps.slang b/Source/Shaders/Shaders/UI/Widget.ps.slang similarity index 50% rename from Source/Shaders/Shaders/UI/Panel.ps.slang rename to Source/Shaders/Shaders/UI/Widget.ps.slang index 454a2766..bc648d90 100644 --- a/Source/Shaders/Shaders/UI/Panel.ps.slang +++ b/Source/Shaders/Shaders/UI/Widget.ps.slang @@ -1,56 +1,57 @@ #include "Include/Common.inc.slang" -struct PanelDrawData +#define WIDGET_TYPE_PANEL 0u +#define WIDGET_TYPE_TEXT 1u + +struct WidgetDrawData { - uint4 packed0; // x: textureIndex & additiveTextureIndex, y: clipMaskTextureIndex, z: color, w: textureScaleToWidgetSizeXY - float4 texCoord; - float4 slicingCoord; - float4 cornerRadiusAndBorder; // xy: cornerRadius, zw: border - uint4 packed1; // x: clipRegionMinXY, y: clipRegionMaxXY, z: clipMaskRegionMinXY, w: clipMaskRegionMaxXY - int4 packed2; // x: worldPositionIndex, y: half2 anchorPos, z: half2 relativePos + uint4 packed0; // x: type, y: vertexBase, z: clipMaskTextureIndex, w: worldPositionIndex (int reinterpret) + uint4 packed1; // Panel: x: textureIndex & additiveTextureIndex, z: color, w: textureScaleToWidgetSize (half2). Text: x: fontTextureIndex, z: textColor, w: borderColor + float4 texCoord; // Panel only + float4 slicingCoord; // Panel only + float4 cornerRadiusAndBorder; // Panel: xy: cornerRadius. 
Text: x: borderSize, zw: unitRange + uint4 packed2; // x: clipRegionMinXY, y: clipRegionMaxXY, z: clipMaskRegionMinXY, w: clipMaskRegionMaxXY }; -[[vk::binding(2, PER_PASS)]] StructuredBuffer _panelDrawDatas; +[[vk::binding(2, PER_PASS)]] StructuredBuffer _widgetDrawDatas; [[vk::binding(3, PER_PASS)]] SamplerState _sampler; [[vk::binding(4, PER_PASS)]] Texture2D _textures[4096]; +[[vk::binding(5, PER_PASS)]] Texture2D _fontTextures[4096]; float NineSliceAxis(float coord, float pixelSizeUV, float texCoordMin, float texCoordMax, float borderSizeMin, float borderSizeMax) { - /* Original Code - float scaledBorderMin = texCoordMin + (borderSizeMin * pixelSizeUV); - if (coord < scaledBorderMin) // Min - return Map(coord, texCoordMin, scaledBorderMin, texCoordMin, texCoordMin + borderSizeMin); - - float scaledBorderMax = texCoordMax - (borderSizeMax * pixelSizeUV); - if (coord < scaledBorderMax) // Center - return Map(coord, scaledBorderMin, scaledBorderMax, texCoordMin + borderSizeMin, texCoordMax - borderSizeMax); - - // Max - return Map(coord, scaledBorderMax, texCoordMax, texCoordMax - borderSizeMax, texCoordMax); - */ - // Branchless Version float scaledBorderMin = texCoordMin + (borderSizeMin * pixelSizeUV); float scaledBorderMax = texCoordMax - (borderSizeMax * pixelSizeUV); - + bool isBorderMin = coord < scaledBorderMin; bool isCenter = !isBorderMin && coord < scaledBorderMax; bool isBorderMax = !isBorderMin && !isCenter; - + float originalMin = (texCoordMin * isBorderMin) + (scaledBorderMin * isCenter) + (scaledBorderMax * isBorderMax); float originalMax = (scaledBorderMin * isBorderMin) + (scaledBorderMax * isCenter) + (texCoordMax * isBorderMax); float newMin = (texCoordMin * isBorderMin) + ((texCoordMin + borderSizeMin) * isCenter) + ((texCoordMax - borderSizeMax) * isBorderMax); float newMax = ((texCoordMin + borderSizeMin) * isBorderMin) + ((texCoordMax - borderSizeMax) * isCenter) + (texCoordMax * isBorderMax); - + return Map(coord, originalMin, 
originalMax, newMin, newMax); } bool ShouldDiscard(float2 pos, float2 clipMin, float2 clipMax) { - // Check if the position is outside the clip rect return pos.x < clipMin.x || pos.x > clipMax.x || pos.y < clipMin.y || pos.y > clipMax.y; } +float Median(float a, float b, float c) +{ + return max(min(a, b), min(max(a, b), c)); +} + +float ScreenPxRange(float2 uv, float2 unitRange) +{ + float2 screenTexSize = float2(1.0f, 1.0f) / fwidth(uv); + return max(0.5 * dot(unitRange, screenTexSize), 1.0); +} + struct VertexOutput { float4 position : SV_POSITION; @@ -58,20 +59,8 @@ struct VertexOutput nointerpolation uint drawDataID : TEXCOORD1; }; -[shader("fragment")] -float4 main(VertexOutput input) : SV_Target +float4 ShadePanel(WidgetDrawData drawData, VertexOutput input) { - PanelDrawData drawData = _panelDrawDatas[input.drawDataID]; - - float2 screenPos = input.uvAndScreenPos.zw; - float2 clipRegionMin = float2(f16tof32(drawData.packed1.x), f16tof32(drawData.packed1.x >> 16)); - float2 clipRegionMax = float2(f16tof32(drawData.packed1.y), f16tof32(drawData.packed1.y >> 16)); - if (ShouldDiscard(screenPos, clipRegionMin, clipRegionMax)) - { - //return float4(1, 0, 0, 0.3f); - discard; - } - float2 uv = input.uvAndScreenPos.xy; float2 texCoordMin = drawData.texCoord.xy; float2 texCoordMax = drawData.texCoord.zw; @@ -80,8 +69,8 @@ float4 main(VertexOutput input) : SV_Target float2 borderSizeLeftTop = slicingCoordMin - texCoordMin; float2 borderSizeRightBottom = texCoordMax - slicingCoordMax; - - uint packedTextureScaleToWidgetSize = drawData.packed0.w; + + uint packedTextureScaleToWidgetSize = drawData.packed1.w; float2 scale = float2(f16tof32(packedTextureScaleToWidgetSize), f16tof32(packedTextureScaleToWidgetSize >> 16)); float2 scaledUV = float2( @@ -89,41 +78,34 @@ float4 main(VertexOutput input) : SV_Target NineSliceAxis(input.uvAndScreenPos.y, scale.y, texCoordMin.y, texCoordMax.y, borderSizeLeftTop.y, borderSizeRightBottom.y) ); - uint textureIndex = 
drawData.packed0.x & 0xFFFF; - uint additiveTextureIndex = drawData.packed0.x >> 16; - uint packedColor = drawData.packed0.z; - + uint textureIndex = drawData.packed1.x & 0xFFFF; + uint additiveTextureIndex = drawData.packed1.x >> 16; + uint packedColor = drawData.packed1.z; + float4 colorMultiplier = PackedUnormsToFloat4(packedColor); float4 color = _textures[textureIndex].Sample(_sampler, scaledUV); color *= colorMultiplier; float4 additiveColor = _textures[additiveTextureIndex].Sample(_sampler, scaledUV); - float additiveIntensity = dot(additiveColor.rgb, float3(0.299, 0.587, 0.114)) * 2.5f; // Constants from https://en.wikipedia.org/wiki/Grayscale#Luma_coding_in_video_systems + float additiveIntensity = dot(additiveColor.rgb, float3(0.299, 0.587, 0.114)) * 2.5f; additiveIntensity = saturate(additiveIntensity); - // Add the additive color to the base color color.rgb += additiveColor.rgb; - - // Blend in the intensity color.a = max(color.a, additiveIntensity); - float2 cornerRadius = drawData.cornerRadiusAndBorder.xy; // Specified in UV space + float2 cornerRadius = drawData.cornerRadiusAndBorder.xy; // Calculate distance to nearest edge float2 edgeDist = min(uv, 1.0 - uv); - // Check if cornerRadius is greater than zero if (cornerRadius.x > 0 && cornerRadius.y > 0) { - // Check if within the rounded corner area if (edgeDist.x < cornerRadius.x && edgeDist.y < cornerRadius.y) { - // Calculate distance from the corner using an elliptical formula float2 normalizedDist = 1.0 - ((edgeDist) / cornerRadius); float distToCorner = length(normalizedDist); - // Discard pixel if it's outside the rounded corner radius if (distToCorner > 1.0) { discard; @@ -131,12 +113,81 @@ float4 main(VertexOutput input) : SV_Target } } - // Apply the clipMask - float2 clipMaskRegionMin = float2(f16tof32(drawData.packed1.z), f16tof32(drawData.packed1.z >> 16)); - float2 clipMaskRegionMax = float2(f16tof32(drawData.packed1.w), f16tof32(drawData.packed1.w >> 16)); + return color; +} + +float4 
ShadeText(WidgetDrawData drawData, VertexOutput input) +{ + uint fontTextureIndex = drawData.packed1.x & 0xFFFF; + + uint packedTextColor = drawData.packed1.z; + uint packedBorderColor = drawData.packed1.w; + + float4 textColor = PackedUnormsToFloat4(packedTextColor); + float4 borderColor = PackedUnormsToFloat4(packedBorderColor); + + float borderSize = drawData.cornerRadiusAndBorder.x; + float2 unitRange = drawData.cornerRadiusAndBorder.zw; + + float4 distances = _fontTextures[fontTextureIndex].Sample(_sampler, input.uvAndScreenPos.xy).rgba; + + const float roundedInlines = 0.0f; + const float roundedOutlines = 1.0f; + const float outBias = 1.0 / 4.0; + + float distMsdf = Median(distances.r, distances.g, distances.b); + float distSdf = distances.a; // mtsdf format only + distMsdf = min(distMsdf, distSdf + 0.1f); // HACK: to fix glitch in msdf near edges, see https://www.redblobgames.com/x/2404-distance-field-effects/ + + // Blend between sharp and rounded corners + float distInner = lerp(distMsdf, distSdf, roundedInlines); + float distOuter = lerp(distMsdf, distSdf, roundedOutlines); + + const float threshold = 0.5f; + float width = ScreenPxRange(input.uvAndScreenPos.xy, unitRange); + + float inner = width * (distInner - threshold) + 0.5f + outBias; + float outer = width * (distOuter - threshold) + 0.5f + outBias + borderSize; + + float innerOpacity = saturate(inner); + float4 innerColor = textColor; + float outerOpacity = saturate(outer); + float4 outerColor = float4(borderColor.rgb, 1.0f); + + return (innerColor * innerOpacity) + (outerColor * (outerOpacity - innerOpacity)); +} + +[shader("fragment")] +float4 main(VertexOutput input) : SV_Target +{ + WidgetDrawData drawData = _widgetDrawDatas[input.drawDataID]; + + // Shared clip-region discard + float2 screenPos = input.uvAndScreenPos.zw; + float2 clipRegionMin = float2(f16tof32(drawData.packed2.x), f16tof32(drawData.packed2.x >> 16)); + float2 clipRegionMax = float2(f16tof32(drawData.packed2.y), 
f16tof32(drawData.packed2.y >> 16)); + if (ShouldDiscard(screenPos, clipRegionMin, clipRegionMax)) + { + discard; + } + + uint type = drawData.packed0.x; + float4 color; + if (type == WIDGET_TYPE_PANEL) + { + color = ShadePanel(drawData, input); + } + else // WIDGET_TYPE_TEXT + { + color = ShadeText(drawData, input); + } + + // Shared clipMask + float2 clipMaskRegionMin = float2(f16tof32(drawData.packed2.z), f16tof32(drawData.packed2.z >> 16)); + float2 clipMaskRegionMax = float2(f16tof32(drawData.packed2.w), f16tof32(drawData.packed2.w >> 16)); float2 maskUV = (screenPos - clipMaskRegionMin) / (clipMaskRegionMax - clipMaskRegionMin); - uint clipMaskTextureIndex = drawData.packed0.y; + uint clipMaskTextureIndex = drawData.packed0.z; float clipMask = _textures[clipMaskTextureIndex].Sample(_sampler, maskUV).a; if (clipMask < 0.5f) { @@ -148,4 +199,4 @@ float4 main(VertexOutput input) : SV_Target color.rgb *= color.a; return saturate(color); -} \ No newline at end of file +} diff --git a/Source/Shaders/Shaders/UI/Panel.vs.slang b/Source/Shaders/Shaders/UI/Widget.vs.slang similarity index 59% rename from Source/Shaders/Shaders/UI/Panel.vs.slang rename to Source/Shaders/Shaders/UI/Widget.vs.slang index c46a0b7f..ffb15488 100644 --- a/Source/Shaders/Shaders/UI/Panel.vs.slang +++ b/Source/Shaders/Shaders/UI/Widget.vs.slang @@ -4,16 +4,16 @@ [[vk::binding(0, PER_PASS)]] StructuredBuffer _vertices; [[vk::binding(1, PER_PASS)]] StructuredBuffer _widgetWorldPositions; -struct PanelDrawData +struct WidgetDrawData { - uint4 packed0; // x: textureIndex & additiveTextureIndex, y: clipMaskTextureIndex, z: color, w: textureScaleToWidgetSizeXY - float4 texCoord; - float4 slicingCoord; - float4 cornerRadiusAndBorder; // xy: cornerRadius, zw: border - uint4 packed1; // x: clipRegionMinXY, y: clipRegionMaxXY, z: clipMaskRegionMinXY, w: clipMaskRegionMaxXY - int4 packed2; // x: worldPositionIndex, yzw: unused + uint4 packed0; // x: type, y: vertexBase, z: clipMaskTextureIndex, w: 
worldPositionIndex (int reinterpret) + uint4 packed1; // Panel: x: textureIndex & additiveTextureIndex, z: color, w: textureScaleToWidgetSize (half2). Text: x: fontTextureIndex, z: textColor, w: borderColor + float4 texCoord; // Panel only + float4 slicingCoord; // Panel only + float4 cornerRadiusAndBorder; // Panel: xy: cornerRadius. Text: x: borderSize, zw: unitRange + uint4 packed2; // x: clipRegionMinXY, y: clipRegionMaxXY, z: clipMaskRegionMinXY, w: clipMaskRegionMaxXY }; -[[vk::binding(2, PER_PASS)]] StructuredBuffer _panelDrawDatas; +[[vk::binding(2, PER_PASS)]] StructuredBuffer _widgetDrawDatas; struct VertexInput { @@ -31,13 +31,15 @@ struct VertexOutput [shader("vertex")] VertexOutput main(VertexInput input) { - float4 vertex = _vertices[input.vertexID]; + WidgetDrawData drawData = _widgetDrawDatas[input.drawDataID]; + + uint vertexBase = drawData.packed0.y; + float4 vertex = _vertices[vertexBase + input.vertexID]; float2 position = vertex.xy; float2 uv = vertex.zw; - PanelDrawData drawData = _panelDrawDatas[input.drawDataID]; - int worldPositionIndex = drawData.packed2.x; + int worldPositionIndex = (int)drawData.packed0.w; float4 finalPos; if (worldPositionIndex >= 0) @@ -63,4 +65,4 @@ VertexOutput main(VertexInput input) output.drawDataID = input.drawDataID; return output; -} \ No newline at end of file +} diff --git a/Submodules/Engine b/Submodules/Engine index 67e495f3..644e9784 160000 --- a/Submodules/Engine +++ b/Submodules/Engine @@ -1 +1 @@ -Subproject commit 67e495f3d28e6a30826b706f82dab69c82de65b8 +Subproject commit 644e9784925f632dd9764b4cfd29c98f42aa0760