From 7e8dd97d412ffec5ba46dbfe88e402c3a6846ea2 Mon Sep 17 00:00:00 2001 From: KaivalyaD Date: Mon, 2 Feb 2026 18:43:43 +0530 Subject: [PATCH 1/8] Minimal changes required for linux build, can't run yet due to segfault --- CMakeLists.txt | 1 + include/rtm/aabb.hpp | 1 + include/rtm/bvh.hpp | 2 +- include/rtm/float.hpp | 9 ++++++--- include/rtm/{HECWBVH.HPP => hecwbvh.hpp} | 2 +- src/arches-v2/CMakeLists.txt | 8 ++++++-- src/arches-v2/isa/riscv.cpp | 7 ++++--- src/arches-v2/main.cpp | 1 + src/arches-v2/shared-utils.hpp | 14 ++++++++++---- src/arches-v2/stdafx.hpp | 8 +++++--- src/arches-v2/trax.hpp | 11 +++++++---- .../units/dual-streaming/unit-stream-scheduler.cpp | 2 +- .../units/dual-streaming/unit-treelet-rt-core.cpp | 4 ++-- src/arches-v2/units/trax/unit-prt-core.cpp | 8 ++++---- src/arches-v2/units/unit-dram.cpp | 2 +- src/arches-v2/units/unit-dram.hpp | 2 +- src/arches-v2/units/unit-stream-cache.hpp | 11 +++++++---- src/arches-v2/units/usimm/usimm.cc | 12 +++++++----- src/arches-v2/util/alignment-allocator.hpp | 6 ++++-- src/arches-v2/util/bit-manipulation.hpp | 6 ++++-- src/arches-v2/util/file.cpp | 1 + src/arches-v2/util/stbi.cpp | 2 +- src/dual-streaming-kernel/CMakeLists.txt | 3 +++ src/strata-rt-kernel/CMakeLists.txt | 3 +++ src/trax-kernel/CMakeLists.txt | 3 +++ 25 files changed, 85 insertions(+), 44 deletions(-) rename include/rtm/{HECWBVH.HPP => hecwbvh.hpp} (99%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1989187..f1e772a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,6 +5,7 @@ project(arches-v2 LANGUAGES CXX) set_property(GLOBAL PROPERTY USE_FOLDERS ON) set(CMAKE_CXX_STANDARD 20) +set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) add_subdirectory(external) diff --git a/include/rtm/aabb.hpp b/include/rtm/aabb.hpp index d419928..c1b93d2 100644 --- a/include/rtm/aabb.hpp +++ b/include/rtm/aabb.hpp @@ -1,5 +1,6 @@ #pragma once +#include #include "float.hpp" #include "int.hpp" #include "vec3.hpp" diff --git a/include/rtm/bvh.hpp b/include/rtm/bvh.hpp index 8dacddd..b93dc2d 100644 --- a/include/rtm/bvh.hpp +++ b/include/rtm/bvh.hpp @@ -168,7 +168,7 @@ class BVH2 for(uint i = start; i < end; ++i) { uint64_t mask = common_prefix ^ build_objects[i].morton_code; - common_prefix_size = std::min(common_prefix_size, _lzcnt_u64(mask)); + common_prefix_size = std::min(common_prefix_size, (uint64_t)_lzcnt_u64(mask)); } //All keys are identical. An arbitrary split diff --git a/include/rtm/float.hpp b/include/rtm/float.hpp index eb1f738..f04e288 100644 --- a/include/rtm/float.hpp +++ b/include/rtm/float.hpp @@ -5,7 +5,10 @@ #ifndef __riscv #include -#include +// #include +#include +extern __m128 _mm_cos_ps(__m128 __A); // Defined but not declared in immintrin.h +extern __m128 _mm_sin_ps(__m128 __A); // Defined but not declared in immintrin.h #include #endif @@ -140,7 +143,7 @@ inline int32_t f32_to_i24(float f32, uint8_t max_exp = 127, int rounding = 0) else if(rounding == 1) norm = std::ceil(norm); - if(norm > ((1 << 23) - 1) || norm < -(1 << 23)) __debugbreak(); + // if(norm > ((1 << 23) - 1) || norm < -(1 << 23)) __debugbreak(); return (int32_t)norm; } @@ -168,7 +171,7 @@ inline uint16_t f32_to_i16(float f32, uint8_t max_exp = 127, int rounding = 0) else if(rounding == 1) norm = std::ceil(norm); - if(norm > ((1 << 15) - 1) || norm < -(1 << 15)) __debugbreak(); + // if(norm > ((1 << 15) - 1) || norm < -(1 << 15)) __debugbreak(); return (int16_t)norm; } #endif diff --git a/include/rtm/HECWBVH.HPP b/include/rtm/hecwbvh.hpp similarity index 99% rename from include/rtm/HECWBVH.HPP rename to include/rtm/hecwbvh.hpp index 316e0d6..e12db63 100644 --- a/include/rtm/HECWBVH.HPP +++ b/include/rtm/hecwbvh.hpp @@ -150,7 +150,7 @@ class HECWBVH if(prim_size / sizeof(Node) != WBVH::LEAF_RATIO) { printf("HE%dCWBVH%d: Warning incorrect leaf ratio!!!\n", Node::NQ, WIDTH); - __debugbreak(); + // __debugbreak(); } uint8_t max_exp = 0; diff --git a/src/arches-v2/CMakeLists.txt b/src/arches-v2/CMakeLists.txt index 977b40f..90125e7 100644 --- a/src/arches-v2/CMakeLists.txt +++ b/src/arches-v2/CMakeLists.txt @@ -1,6 +1,9 @@ cmake_minimum_required(VERSION 3.14) add_compile_definitions(UNICODE _UNICODE) +# Enable intrinsics like _lzcnt_u64 - find this out +add_compile_options(-march=native) + set(PROJECT_NAME "arches-v2") file(GLOB_RECURSE ALL_INCLUDE CONFIGURE_DEPENDS "*.hpp" "*.h") @@ -53,8 +56,9 @@ target_include_directories(${PROJECT_NAME} PUBLIC ${PROJECT_SOURCE_DIR}/src) set_property(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTY VS_STARTUP_PROJECT DISABLE) set_target_properties(${PROJECT_NAME} PROPERTIES OUTPUT_NAME ${PROJECT_NAME}) -target_link_directories(${PROJECT_NAME} PUBLIC ${PROJECT_SOURCE_DIR}/libraries/tbb) -target_link_libraries(${PROJECT_NAME} PRIVATE tbb12.lib) +# target_link_directories(${PROJECT_NAME} PUBLIC ${PROJECT_SOURCE_DIR}/libraries/tbb) +# target_link_libraries(${PROJECT_NAME} PRIVATE tbb12.lib) +target_link_libraries(${PROJECT_NAME} PRIVATE tbb) target_link_libraries(${PROJECT_NAME} PRIVATE Ramulator) #set_target_properties(${PROJECT_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY_DEBUG ${CMAKE_CURRENT_BINARY_DIR}) #set_target_properties(${PROJECT_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY_RELEASE ${CMAKE_CURRENT_BINARY_DIR}) diff --git a/src/arches-v2/isa/riscv.cpp b/src/arches-v2/isa/riscv.cpp index a9ae7fa..e4e7b82 100644 --- a/src/arches-v2/isa/riscv.cpp +++ b/src/arches-v2/isa/riscv.cpp @@ -1,6 +1,7 @@ #include "riscv.hpp" -#include +// #include +#include #include "errors.hpp" #include "util/bit-manipulation.hpp" @@ -203,7 +204,7 @@ InstructionInfo const isa_SYSTEM[2] = InstructionInfo(0b000000000001, "ebreak", InstrType::SYS, Encoding::I, RegFile::INT, EXEC_DECL { //break point - __debugbreak(); + // __debugbreak(); }), }; @@ -745,7 +746,7 @@ InstructionInfo const isa_OP_FSGNJ_FP[3] = //r.funct3 { float rs1 = unit->float_regs->registers[instr.r.rs1].f32; float rs2 = unit->float_regs->registers[instr.r.rs2].f32; - unit->float_regs->registers[instr.r.rd].f32 = copysignf(rs1, (signbit(rs1) != signbit(rs2)) ? -1.0f : 1.0f); + unit->float_regs->registers[instr.r.rd].f32 = copysignf(rs1, (std::signbit(rs1) != std::signbit(rs2)) ? -1.0f : 1.0f); }), }; diff --git a/src/arches-v2/main.cpp b/src/arches-v2/main.cpp index 69fa90a..3207ca6 100644 --- a/src/arches-v2/main.cpp +++ b/src/arches-v2/main.cpp @@ -10,6 +10,7 @@ int arches_verbosity = 1; int main(int argc, char* argv[]) { + Arches::set_full_exe_name(argv[0]); Arches::SimulationConfig sim_config(argc, argv); sim_config.print(); diff --git a/src/arches-v2/shared-utils.hpp b/src/arches-v2/shared-utils.hpp index b5bfec3..e286dd8 100644 --- a/src/arches-v2/shared-utils.hpp +++ b/src/arches-v2/shared-utils.hpp @@ -17,15 +17,21 @@ #include "isa/riscv.hpp" #include "rtm/rtm.hpp" -#include +// #include +#include +char full_exe_name[FILENAME_MAX]; namespace Arches { +void set_full_exe_name(const char *name) { + strcpy(full_exe_name, name); +} + std::string get_project_folder_path() { - CHAR path[MAX_PATH]; - GetModuleFileNameA(NULL, path, MAX_PATH); - std::string executable_path(path); + // CHAR path[MAX_PATH]; + // GetModuleFileNameA(NULL, path, MAX_PATH); + std::string executable_path(full_exe_name); return executable_path.substr(0, executable_path.rfind("build")); } diff --git a/src/arches-v2/stdafx.hpp b/src/arches-v2/stdafx.hpp index ed96b56..593090e 100644 --- a/src/arches-v2/stdafx.hpp +++ b/src/arches-v2/stdafx.hpp @@ -142,8 +142,9 @@ #include #include -#include -#include +// #include +// #include +#include @@ -151,7 +152,8 @@ #ifndef _DEBUG inline void _assert(bool x) { - if(!x) __debugbreak(); + // if(!x) __debugbreak(); + assert(x); } #else inline void _assert(bool x) diff --git a/src/arches-v2/trax.hpp b/src/arches-v2/trax.hpp index c4d3082..cdb9b29 100644 --- a/src/arches-v2/trax.hpp +++ b/src/arches-v2/trax.hpp @@ -132,8 +132,10 @@ static TRaXKernelArgs initilize_buffers(uint8_t* main_memory, paddr_t& heap_addr { std::string scene_name = sim_config.get_string("scene_name"); std::string project_folder = get_project_folder_path(); - std::string datasets_folder = project_folder + "datasets\\"; - std::string cache_folder = project_folder + "datasets\\cache\\"; + // std::string datasets_folder = project_folder + "datasets\\"; + // std::string cache_folder = project_folder + "datasets\\cache\\"; + std::string datasets_folder = project_folder + "datasets/"; + std::string cache_folder = project_folder + "datasets/cache/"; TRaXKernelArgs args; args.framebuffer_width = sim_config.get_int("framebuffer_width"); @@ -362,7 +364,8 @@ static void run_sim_trax(SimulationConfig& sim_config) //DRAM UnitDRAM::Configuration dram_config; - dram_config.config_path = project_folder_path + "build\\src\\arches-v2\\config-files\\gddr6_14000_config.yaml"; + // dram_config.config_path = project_folder_path + "build\\src\\arches-v2\\config-files\\gddr6_14000_config.yaml"; + dram_config.config_path = project_folder_path + "build/src/arches-v2/config-files/gddr6_14000_config.yaml"; dram_config.size = 1ull << 30; //1GB dram_config.clock_ratio = dram_clock / core_clock; dram_config.latency = 92; @@ -459,7 +462,7 @@ static void run_sim_trax(SimulationConfig& sim_config) UnitL1Cache::PowerConfig l1d_power_config; #endif - ELF elf(project_folder_path + "src\\trax-kernel\\riscv\\kernel"); + ELF elf(project_folder_path + "src/trax-kernel/riscv/kernel"); ISA::RISCV::InstructionTypeNameDatabase::get_instance()[ISA::RISCV::InstrType::CUSTOM0] = "FCHTHRD"; ISA::RISCV::InstructionTypeNameDatabase::get_instance()[ISA::RISCV::InstrType::CUSTOM1] = "BOXISECT"; diff --git a/src/arches-v2/units/dual-streaming/unit-stream-scheduler.cpp b/src/arches-v2/units/dual-streaming/unit-stream-scheduler.cpp index 9ba7a53..7017b3d 100644 --- a/src/arches-v2/units/dual-streaming/unit-stream-scheduler.cpp +++ b/src/arches-v2/units/dual-streaming/unit-stream-scheduler.cpp @@ -224,7 +224,7 @@ void UnitStreamScheduler::_update_scheduler() SegmentState& child_state = _scheduler.segment_state_map[child_id]; child_state.depth = last_segment_state.depth + 1; if(_scheduler.weight_scheme == 0) child_weights[i] = child_state.weight; // based on total weight - else if(_scheduler.weight_scheme == 1) child_weights[i] = child_state.weight / std::max(1ull, child_state.num_rays); // based on average ray weight + else if(_scheduler.weight_scheme == 1) child_weights[i] = child_state.weight / std::max((uint64_t)1, child_state.num_rays); // based on average ray weight else child_weights[i] = 0.0f; //falls back to order in memory child_state.scheduled_weight = child_weights[i]; } diff --git a/src/arches-v2/units/dual-streaming/unit-treelet-rt-core.cpp b/src/arches-v2/units/dual-streaming/unit-treelet-rt-core.cpp index ece423e..ec4f799 100644 --- a/src/arches-v2/units/dual-streaming/unit-treelet-rt-core.cpp +++ b/src/arches-v2/units/dual-streaming/unit-treelet-rt-core.cpp @@ -23,7 +23,7 @@ template bool UnitTreeletRTCore::_try_queue_node(uint ray_id, uint treelet_id, uint node_id) { paddr_t start = (paddr_t)&((TT*)_treelet_base_addr)[treelet_id].nodes[node_id]; - paddr_t end = start + sizeof(TT::Node); + paddr_t end = start + sizeof(typename TT::Node); RayState& ray_state = _ray_states[ray_id]; ray_state.buffer.address = start; @@ -169,7 +169,7 @@ void UnitTreeletRTCore::_read_returns() if(buffer.type == 0) { - if(buffer.bytes_filled == sizeof(TT::Node)) + if(buffer.bytes_filled == sizeof(typename TT::Node)) { ray_state.phase = RayState::Phase::NODE_ISECT; _node_isect_queue.push(ray_id); diff --git a/src/arches-v2/units/trax/unit-prt-core.cpp b/src/arches-v2/units/trax/unit-prt-core.cpp index 3468028..4b220b9 100644 --- a/src/arches-v2/units/trax/unit-prt-core.cpp +++ b/src/arches-v2/units/trax/unit-prt-core.cpp @@ -7,7 +7,7 @@ namespace TRaX { template -UnitPRTCore::UnitPRTCore(const Configuration& config) : +UnitPRTCore::UnitPRTCore(const Configuration& config) : _max_rays(config.max_rays), _cache_port(config.cache_port), _num_clients(config.num_clients), _node_base_addr(config.node_base_addr), _tri_base_addr(config.tri_base_addr), _cache(config.cache), _request_network(config.num_clients, 1), _return_network(1, config.num_clients), @@ -58,8 +58,8 @@ void UnitPRTCore::clock_fall() template bool UnitPRTCore::_try_queue_node(uint ray_id, uint node_id) { - paddr_t start = _node_base_addr + node_id * sizeof(NT::Node); - paddr_t end = start + sizeof(NT::Node); + paddr_t start = _node_base_addr + node_id * sizeof(typename NT::Node); + paddr_t end = start + sizeof(typename NT::Node); _assert(start < 0x1ull << 32); @@ -190,7 +190,7 @@ void UnitPRTCore::_read_returns() if(buffer.type == 0) { - if(buffer.bytes_filled == sizeof(NT::Node)) + if(buffer.bytes_filled == sizeof(typename NT::Node)) { ray_state.phase = RayState::Phase::NODE_ISECT; _node_isect_queue.push(ray_id); diff --git a/src/arches-v2/units/unit-dram.cpp b/src/arches-v2/units/unit-dram.cpp index b9ad756..01e52e2 100644 --- a/src/arches-v2/units/unit-dram.cpp +++ b/src/arches-v2/units/unit-dram.cpp @@ -1,6 +1,6 @@ #include "unit-dram.hpp" -#include "USIMM/usimm.h" +#include "usimm/usimm.h" namespace Arches { namespace Units { diff --git a/src/arches-v2/units/unit-dram.hpp b/src/arches-v2/units/unit-dram.hpp index 566402f..0d0c455 100644 --- a/src/arches-v2/units/unit-dram.hpp +++ b/src/arches-v2/units/unit-dram.hpp @@ -1,7 +1,7 @@ #pragma once #include "stdafx.hpp" -#include "USIMM/memory_controller.h" +#include "usimm/memory_controller.h" #include "unit-base.hpp" #include "unit-main-memory-base.hpp" diff --git a/src/arches-v2/units/unit-stream-cache.hpp b/src/arches-v2/units/unit-stream-cache.hpp index 1a39455..f1bee3c 100644 --- a/src/arches-v2/units/unit-stream-cache.hpp +++ b/src/arches-v2/units/unit-stream-cache.hpp @@ -36,10 +36,13 @@ class UnitStreamCache : public UnitCacheBase struct PowerConfig { //Energy is joules, power in watts - float tag_energy{0.0f}; - float read_energy{0.0f}; - float write_energy{0.0f}; - float leakage_power{0.0f}; + float tag_energy; + float read_energy; + float write_energy; + float leakage_power; + + PowerConfig() : + tag_energy(0.0f), read_energy(0.0f), write_energy(0.0f), leakage_power(0.0f) {} }; UnitStreamCache(Configuration config); diff --git a/src/arches-v2/units/usimm/usimm.cc b/src/arches-v2/units/usimm/usimm.cc index 202dfc2..167de0e 100644 --- a/src/arches-v2/units/usimm/usimm.cc +++ b/src/arches-v2/units/usimm/usimm.cc @@ -21,7 +21,7 @@ affected threads' ready cycles for the given register. #include "memory_controller.h" #include "scheduler.h" #include "params.h" -#include +// #include #define MAXTRACELINESIZE 64 @@ -205,10 +205,12 @@ int usimm_setup(char* config_filename, long long int *addr; long long int *instrpc; - TCHAR exePath[MAX_PATH]; - GetModuleFileName(NULL, exePath, MAX_PATH); - std::wstring fullPath(exePath); - std::wstring exeFolder = fullPath.substr(0, fullPath.find_last_of(L"\\") + 1); + // TCHAR exePath[MAX_PATH]; + // GetModuleFileName(NULL, exePath, MAX_PATH); + // std::wstring fullPath(exePath); + // std::wstring exeFolder = fullPath.substr(0, fullPath.find_last_of(L"\\") + 1); + std::string fullPath("build/src/arches-v2/arches-v2"); + std::string exeFolder = fullPath.substr(0, fullPath.find_last_of("/") + 1); std::string current_folder_path(exeFolder.begin(), exeFolder.end()); std::string abs_path = current_folder_path + config_filename; diff --git a/src/arches-v2/util/alignment-allocator.hpp b/src/arches-v2/util/alignment-allocator.hpp index 0a2a1ae..b9126cc 100644 --- a/src/arches-v2/util/alignment-allocator.hpp +++ b/src/arches-v2/util/alignment-allocator.hpp @@ -35,12 +35,14 @@ class AlignmentAllocator inline pointer allocate(size_type n) { - return (pointer)_aligned_malloc(n * sizeof(value_type), N); + // return (pointer)_aligned_malloc(n * sizeof(value_type), N); + return (pointer)aligned_alloc(N, n * sizeof(value_type)); } inline void deallocate(pointer p, size_type) { - _aligned_free(p); + // _aligned_free(p); + free(p); } inline void construct(pointer p, const value_type& wert) diff --git a/src/arches-v2/util/bit-manipulation.hpp b/src/arches-v2/util/bit-manipulation.hpp index 9792f69..912d1ad 100644 --- a/src/arches-v2/util/bit-manipulation.hpp +++ b/src/arches-v2/util/bit-manipulation.hpp @@ -34,12 +34,14 @@ inline uint clz(uint64_t mask) inline uint popcnt(uint64_t mask) { - return __popcnt64(mask); + // return __popcnt64(mask); + return _popcnt64(mask); } inline uint64_t rotr(uint64_t mask, uint n) { - return _rotr64(mask, n); + // return _rotr64(mask, n); + return _lrotr(mask, n); } inline uint64_t pdep(uint64_t data, uint64_t mask) diff --git a/src/arches-v2/util/file.cpp b/src/arches-v2/util/file.cpp index dc15a03..3bfa7d0 100644 --- a/src/arches-v2/util/file.cpp +++ b/src/arches-v2/util/file.cpp @@ -5,6 +5,7 @@ #include #include #else + #include #include #include #endif diff --git a/src/arches-v2/util/stbi.cpp b/src/arches-v2/util/stbi.cpp index 35594c4..36dfd2a 100644 --- a/src/arches-v2/util/stbi.cpp +++ b/src/arches-v2/util/stbi.cpp @@ -1,7 +1,7 @@ //stb image #define STB_IMAGE_IMPLEMENTATION #define STB_IMAGE_WRITE_IMPLEMENTATION -#define STBI_MSC_SECURE_CRT +// #define STBI_MSC_SECURE_CRT //stb image #include "stb_image.h" diff --git a/src/dual-streaming-kernel/CMakeLists.txt b/src/dual-streaming-kernel/CMakeLists.txt index 8e8885e..dcd9724 100644 --- a/src/dual-streaming-kernel/CMakeLists.txt +++ b/src/dual-streaming-kernel/CMakeLists.txt @@ -1,5 +1,8 @@ cmake_minimum_required(VERSION 3.14) +# Enable intrinsics like _lzcnt_u64 - find this out +add_compile_options(-march=native) + set(PROJECT_NAME "dual-streaming-kernel") file(GLOB FILES *.hpp *.cpp) diff --git a/src/strata-rt-kernel/CMakeLists.txt b/src/strata-rt-kernel/CMakeLists.txt index b956b4a..f5d0e53 100644 --- a/src/strata-rt-kernel/CMakeLists.txt +++ b/src/strata-rt-kernel/CMakeLists.txt @@ -1,5 +1,8 @@ cmake_minimum_required(VERSION 3.14) +# Enable intrinsics like _lzcnt_u64 - find this out +add_compile_options(-march=native) + set(PROJECT_NAME "strata-rt-kernel") file(GLOB FILES *.hpp *.cpp) diff --git a/src/trax-kernel/CMakeLists.txt b/src/trax-kernel/CMakeLists.txt index 38da35b..650590e 100644 --- a/src/trax-kernel/CMakeLists.txt +++ b/src/trax-kernel/CMakeLists.txt @@ -1,5 +1,8 @@ cmake_minimum_required(VERSION 3.14) +# Enable intrinsics like _lzcnt_u64 - find this out +add_compile_options(-march=native) + set(PROJECT_NAME "trax-kernel") file(GLOB FILES *.hpp *.cpp) From 7731c0f3df176fcbeb9a0ffc3234d46ab0d8a1bb Mon Sep 17 00:00:00 2001 From: KaivalyaD Date: Thu, 5 Feb 2026 06:05:39 +0530 Subject: [PATCH 2/8] Fixed dangling reference to local variable --- src/arches-v2/units/unit-sfu.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/arches-v2/units/unit-sfu.hpp b/src/arches-v2/units/unit-sfu.hpp index d6fb97d..04c0f7d 100644 --- a/src/arches-v2/units/unit-sfu.hpp +++ b/src/arches-v2/units/unit-sfu.hpp @@ -54,7 +54,7 @@ class UnitSFU : public UnitBase return return_crossbar.peek(port_index); } - virtual const SFURequest& read_return(uint port_index) + virtual const SFURequest read_return(uint port_index) { return return_crossbar.read(port_index); } From acac330717de83e0b70851eba7b4d93933da796e Mon Sep 17 00:00:00 2001 From: KaivalyaD Date: Wed, 11 Mar 2026 23:42:52 +0530 Subject: [PATCH 3/8] Custom ISA without recompiling gcc --- .gitignore | 3 ++- scripts/log-parser.py | 30 +++++++++++++++++++++++++++--- src/trax-kernel/custom-instr.hpp | 6 ++++-- src/trax-kernel/intersect.hpp | 19 ++++++++++++++++--- src/trax-kernel/makefile | 4 +++- 5 files changed, 52 insertions(+), 10 deletions(-) diff --git a/.gitignore b/.gitignore index 3a84d71..09f6839 100644 --- a/.gitignore +++ b/.gitignore @@ -7,8 +7,9 @@ # objs datasets -# build +# directories build +venv ## log info *_log.txt diff --git a/scripts/log-parser.py b/scripts/log-parser.py index 655289c..8ed6de0 100644 --- a/scripts/log-parser.py +++ b/scripts/log-parser.py @@ -39,6 +39,10 @@ def gauss(n=11,sigma=1): data[i].append(float(re.findall(r'[\d]*[.][\d]+', line)[0])) +print(f"Total Cycles found: {len(cycles)}") +for i in range(len(keys)): + print(f"Key '{keys[i]}' found {len(data[i])} times") + show_unsmoothed = True if show_unsmoothed: for i in range(len(keys)): @@ -49,13 +53,33 @@ def gauss(n=11,sigma=1): kernel = np.array(gauss(kernel_size, 5)) kernel = kernel / kernel.sum() +# for i in range(len(keys)): +# if len(data[i]) == len(cycles): +# plt.plot(cycles, np.convolve(data[i], kernel, mode='same'), colors[i], label=keys[i]) + for i in range(len(keys)): - if len(data[i]) == len(cycles): - plt.plot(cycles, np.convolve(data[i], kernel, mode='same'), colors[i], label=keys[i]) + # determine the shortest length to keep (x, y) in sync + min_len = min(len(data[i]), len(cycles)) + + if min_len > 0: + # slice both to min_len so (x, y) match perfectly + x = cycles[:min_len] + y = data[i][:min_len] + + # plot unsmoothed (faded) + plt.plot(x, y, colors[i], alpha=0.125) + + # plot smoothed (solid with label) + smoothed = np.convolve(y, kernel, mode='same') + plt.plot(x, smoothed, colors[i], label=keys[i]) + else: + print(f"Skipping {keys[i]} - no data matches found.") plt.legend(loc="upper right") plt.ylabel(ylabel) plt.xlabel('Time (cycles)') plt.yscale('log') plt.grid() -plt.show() \ No newline at end of file +plt.savefig("simulation_results.png") # save instead of show +print("Plot saved to simulation_results.png") +#plt.show() \ No newline at end of file diff --git a/src/trax-kernel/custom-instr.hpp b/src/trax-kernel/custom-instr.hpp index 455ad8e..1c1848a 100644 --- a/src/trax-kernel/custom-instr.hpp +++ b/src/trax-kernel/custom-instr.hpp @@ -9,7 +9,9 @@ uint32_t inline fchthrd() { #ifdef __riscv uint32_t value = 0; - asm volatile("fchthrd %0\n\t" : "=r" (value)); + // asm volatile("fchthrd %0\n\t" : "=r" (value)); + asm volatile(".insn i 0x0000b, 0, %0, x0, 0\n\t" : "=r" (value)); + return value; #else return _next_thread++; @@ -31,4 +33,4 @@ inline void ebreak() "ebreak\n\t" ); #endif -} \ No newline at end of file +} diff --git a/src/trax-kernel/intersect.hpp b/src/trax-kernel/intersect.hpp index a3044a2..0dc1968 100644 --- a/src/trax-kernel/intersect.hpp +++ b/src/trax-kernel/intersect.hpp @@ -22,7 +22,9 @@ inline void _traceray(uint id, const rtm::Ray& ray, rtm::Hit& hit) asm volatile ( - "traceray %0, %4, %12\t\n" + // "traceray %0, %4, %12\t\n" + // ".insn i 0x0b, 0, %0, %4, %12\t\n" + ".insn i 0x0b, 5, %0, %4, %12\t\n" : "=f" (dst0), "=f" (dst1), "=f" (dst2), "=f" (dst3) : "f" (src0), "f" (src1), "f" (src2), "f" (src3), "f" (src4), "f" (src5), "f" (src6), "f" (src7), "I" (FLAGS) ); @@ -58,7 +60,13 @@ inline float _intersect(const rtm::AABB& aabb, const rtm::Ray& ray, const rtm::v register float src13 asm("f13") = aabb.max.z; float t; - asm volatile ("boxisect %0" : "=f" (t) : "f" (src0), "f" (src1), "f" (src2), "f" (src3), "f" (src4), "f" (src5), "f" (src6), "f" (src7), "f" (src8), "f" (src9), "f" (src10), "f" (src11), "f" (src12), "f" (src13)); + asm volatile ( + /*"boxisect %0"*/ + // ".insn r 0x0b %0" + ".insn r 0x0b, 0, 0x04, %0, x0, x0" + : "=f" (t) + : "f" (src0), "f" (src1), "f" (src2), "f" (src3), "f" (src4), "f" (src5), "f" (src6), "f" (src7), "f" (src8), "f" (src9), "f" (src10), "f" (src11), "f" (src12), "f" (src13) + ); return t; #else @@ -93,7 +101,12 @@ inline bool _intersect(const rtm::Triangle& tri, const rtm::Ray& ray, rtm::Hit& register float dst2 asm("f19") = hit.bc.y; register float dst3 asm("f20") = *(float*)&hit.id; - asm volatile("triisect %0\n\t" : "+f" (dst0), "+f" (dst1), "+f" (dst2), "+f" (dst3) : "f" (src0), "f" (src1), "f" (src2), "f" (src3), "f" (src4), "f" (src5), "f" (src6), "f" (src7), "f" (src8), "f" (src9), "f" (src10), "f" (src11), "f" (src12), "f" (src13), "f" (src14), "f" (src15), "f" (src16)); + asm volatile( + /*"triisect %0\n\t"*/ + // ".insn r 0x0b %0\n\t" + ".insn r 0x0b, 0, 0x08, %0, x0, x0" + : "+f" (dst0), "+f" (dst1), "+f" (dst2), "+f" (dst3) : "f" (src0), "f" (src1), "f" (src2), "f" (src3), "f" (src4), "f" (src5), "f" (src6), "f" (src7), "f" (src8), "f" (src9), "f" (src10), "f" (src11), "f" (src12), "f" (src13), "f" (src14), "f" (src15), "f" (src16) + ); bool is_hit = dst0 < hit.t; float _dst3 = dst3; diff --git a/src/trax-kernel/makefile b/src/trax-kernel/makefile index 32096f1..afde1aa 100644 --- a/src/trax-kernel/makefile +++ b/src/trax-kernel/makefile @@ -2,10 +2,12 @@ make: riscv riscv-dissasm #-mno-relax riscv: ./* - riscv64-unknown-elf-g++ -mno-relax -nostartfiles -emain -Wstack-usage=512 -mabi=lp64f -Ofast -I ../../include ./main.cpp -o ./riscv/kernel + riscv64-unknown-elf-g++ -mno-relax -nostartfiles -emain -Wstack-usage=512 -mabi=lp64f -Ofast -I ../../include ./main.cpp -o ./kernel-gcc + #clang++-20 --target=riscv64-unknown-elf --gcc-toolchain=/opt/riscv --sysroot=/opt/riscv/riscv64-unknown-elf -march=rv64imaf -mabi=lp64f -mno-relax -nostartfiles -Wl,-e,main -Wframe-larger-than=512 -O3 -I ../../include ./main.cpp -o ./kernel-llvm riscv-dissasm: riscv riscv64-unknown-elf-objdump -d -x ./riscv/kernel > "./riscv/kernel.dump" + #riscv64-unknown-elf-objdump -d -x ./kernel-llvm > "./kernel-llvm.dump" clean: rm -f riscv/* \ No newline at end of file From 0380792815358c1cb96221b28625161a24f924e0 Mon Sep 17 00:00:00 2001 From: haydelj Date: Fri, 20 Mar 2026 15:32:27 -0600 Subject: [PATCH 4/8] Added texture unit --- include/rtm/texture.hpp | 47 +- src/arches-v2/dual-streaming.hpp | 752 --------------------------- src/arches-v2/main.cpp | 43 +- src/arches-v2/trax.hpp | 730 -------------------------- src/arches-v2/units/trax/unit-tp.hpp | 10 + src/arches-v2/units/unit-sfu.hpp | 2 +- src/arches-v2/units/unit-texture.cpp | 79 +++ src/arches-v2/units/unit-texture.hpp | 79 +++ src/trax-kernel/custom-instr.hpp | 33 +- src/trax-kernel/main.cpp | 8 +- src/trax-kernel/riscv/kernel | Bin 6952 -> 6336 bytes src/trax-kernel/riscv/kernel.dump | 502 +++++++----------- 12 files changed, 441 insertions(+), 1844 deletions(-) delete mode 100644 src/arches-v2/dual-streaming.hpp delete mode 100644 src/arches-v2/trax.hpp create mode 100644 src/arches-v2/units/unit-texture.cpp create mode 100644 src/arches-v2/units/unit-texture.hpp diff --git a/include/rtm/texture.hpp b/include/rtm/texture.hpp index 0e88a98..ed41627 100644 --- a/include/rtm/texture.hpp +++ b/include/rtm/texture.hpp @@ -73,46 +73,35 @@ class Texture2D } #endif - rtm::vec3 sample(const rtm::vec2& uv) const + rtm::vec4 sample(const rtm::vec2& uv) const { - if(width == 0 && height == 0) return rtm::vec3(0.0f); - if(width == 1 && height == 1) return read({0, 0}); - - rtm::vec2 _uv = rtm::mod(uv, rtm::vec2(1.0f)) * rtm::vec2(width, height); - //return read_nearest(_uv); - - //rtm::vec2 _uv = rtm::mod(uv, rtm::vec2(1.0f)) * rtm::vec2(width, height); - rtm::vec3 s00 = read_nearest(_uv + rtm::vec2(-0.5f, -0.5f)); - rtm::vec3 s10 = read_nearest(_uv + rtm::vec2(0.5f, -0.5f)); - rtm::vec3 s01 = read_nearest(_uv + rtm::vec2(-0.5f, 0.5f)); - rtm::vec3 s11 = read_nearest(_uv + rtm::vec2(0.5f, 0.5f)); - - rtm::vec2 ic = rtm::mod(_uv + rtm::vec2(0.5f), rtm::vec2(1.0f)); - rtm::vec3 s0 = rtm::mix(s00, s01, ic.y); - rtm::vec3 s1 = rtm::mix(s10, s11, ic.y); + if(width == 0 && height == 0) return rtm::vec4(0.0f); + return read_nearest(uv); + } - return rtm::mix(s0, s1, ic.x); + rtm::uvec2 get_iuv(const rtm::vec2& uv, const rtm::vec2& offset = rtm::vec2(0.0f)) const + { + rtm::vec2 fuv = uv * rtm::vec2(width, height) + offset; + return rtm::uvec2(fuv[0], fuv[1]); } -private: - rtm::vec3 read(const rtm::uvec2& iuv) const + Texel* get_texel_addr(const rtm::uvec2& iuv) const { - uint i = iuv[1] * width + iuv[0]; - return rtm::vec3(texels[i].channel[0], texels[i].channel[1], texels[i].channel[2]) * (1.0f / 255.0f); + uint32_t x = iuv[0] % width; + uint32_t y = iuv[1] % height; + return &texels[y * width + x]; } - rtm::uvec2 get_nearest(const rtm::vec2& uv) const + static rtm::vec4 decode_texel(const Texel& texel) { - rtm::uvec2 mv = rtm::uvec2(width, height); - rtm::uvec2 iuv = rtm::uvec2(uv[0], uv[1]) + mv; - iuv[0] = iuv[0] % mv[0]; iuv[1] = iuv[1] % mv[1]; - return iuv; + return rtm::vec4(texel.channel[0], texel.channel[1], texel.channel[2], texel.channel[3]) * (1.0f / 255.0f); } - rtm::vec3 read_nearest(const rtm::vec2& uv) const + rtm::vec4 read_nearest(const rtm::vec2& uv, rtm::vec2 offset = rtm::vec2(0.0f)) const { - rtm::vec3 v = read(get_nearest(uv)); - return v; + rtm::uvec2 iuv = get_iuv(uv, offset); + Texel* texel = get_texel_addr(iuv); + return decode_texel(*texel); } }; diff --git a/src/arches-v2/dual-streaming.hpp b/src/arches-v2/dual-streaming.hpp deleted file mode 100644 index a64e150..0000000 --- a/src/arches-v2/dual-streaming.hpp +++ /dev/null @@ -1,752 +0,0 @@ -#pragma once - -#include "shared-utils.hpp" -#include "units/dual-streaming/unit-treelet-rt-core.hpp" -#include "units/dual-streaming/unit-stream-scheduler.hpp" -#include "units/dual-streaming/unit-ray-staging-buffer.hpp" -#include "units/dual-streaming/unit-tp.hpp" -#include "units/dual-streaming/unit-hit-record-updater.hpp" -#include "units/dual-streaming/unit-scene-buffer.hpp" - -namespace Arches { - -namespace ISA { namespace RISCV { namespace DualStreaming { - -//see the opcode map for details -const static InstructionInfo isa_custom0_000_imm[8] = -{ - InstructionInfo(0x0, "fchthrd", InstrType::CUSTOM0, Encoding::U, RegFile::INT, MEM_REQ_DECL - { - MemoryRequest req; - req.type = MemoryRequest::Type::LOAD; - req.size = sizeof(uint32_t); - req.dst.push(DstReg(instr.rd, RegType::UINT32).u9, 9); - req.vaddr = 0x0ull; - - return req; - }), - InstructionInfo(0x1, "boxisect", InstrType::CUSTOM1, Encoding::U, RegFile::FLOAT, EXEC_DECL - { - Register32 * fr = unit->float_regs->registers; - - rtm::Ray ray; - rtm::vec3 inv_d; - ray.o.x = fr[0].f32; - ray.o.y = fr[1].f32; - ray.o.z = fr[2].f32; - ray.t_min = fr[3].f32; - inv_d.x = fr[4].f32; - inv_d.y = fr[5].f32; - inv_d.z = fr[6].f32; - ray.t_max = fr[7].f32; - - rtm::AABB aabb; - aabb.min.x = fr[8].f32; - aabb.min.y = fr[9].f32; - aabb.min.z = fr[10].f32; - aabb.max.x = fr[11].f32; - aabb.max.y = fr[12].f32; - aabb.max.z = fr[13].f32; - - unit->float_regs->registers[instr.u.rd].f32 = rtm::intersect(aabb, ray, inv_d); - }), - InstructionInfo(0x2, "triisect", InstrType::CUSTOM2, Encoding::U, RegFile::FLOAT, EXEC_DECL - { - Register32 * fr = unit->float_regs->registers; - - rtm::Ray ray; - ray.o.x = fr[0].f32; - ray.o.y = fr[1].f32; - ray.o.z = fr[2].f32; - ray.t_min = fr[3].f32; - ray.d.x = fr[4].f32; - ray.d.y = fr[5].f32; - ray.d.z = fr[6].f32; - ray.t_max = fr[7].f32; - - rtm::Triangle tri; - tri.vrts[0].x = fr[8].f32; - tri.vrts[0].y = fr[9].f32; - tri.vrts[0].z = fr[10].f32; - tri.vrts[1].x = fr[11].f32; - tri.vrts[1].y = fr[12].f32; - tri.vrts[1].z = fr[13].f32; - tri.vrts[2].x = fr[14].f32; - tri.vrts[2].y = fr[15].f32; - tri.vrts[2].z = fr[16].f32; - - rtm::Hit hit; - hit.t = fr[17].f32; - hit.bc[0] = fr[18].f32; - hit.bc[1] = fr[19].f32; - hit.id = fr[20].u32; - - rtm::intersect(tri, ray, hit); - - fr[17].f32 = hit.t; - fr[18].f32 = hit.bc[0]; - fr[19].f32 = hit.bc[1]; - fr[20].u32 = hit.id; - }), -}; - -const static InstructionInfo isa_custom0_funct3[8] = -{ - InstructionInfo(0x0, META_DECL{return isa_custom0_000_imm[instr.u.imm_31_12 >> 3]; }), - InstructionInfo(0x1, "lwi", InstrType::CUSTOM3, Encoding::I, RegFile::FLOAT, RegFile::INT, MEM_REQ_DECL - { - //load bucket ray into registers [rd - (rd + N)] - MemoryRequest mem_req; - mem_req.type = MemoryRequest::Type::LOAD; - mem_req.size = sizeof(WorkItem); - mem_req.dst.push(DstReg(instr.rd, RegType::FLOAT32).u9, 9); - mem_req.vaddr = unit->int_regs->registers[instr.i.rs1].u64 + i_imm(instr); - - return mem_req; - }), - InstructionInfo(0x2, "swi", InstrType::CUSTOM4, Encoding::S, RegFile::FLOAT, RegFile::INT, MEM_REQ_DECL - { - //store bucket ray to hit record updater - MemoryRequest mem_req; - mem_req.type = MemoryRequest::Type::STORE; - mem_req.size = sizeof(WorkItem); - mem_req.vaddr = unit->int_regs->registers[instr.s.rs1].u64 + s_imm(instr); - - Register32* fr = unit->float_regs->registers; - for(uint i = 0; i < sizeof(WorkItem) / sizeof(float); ++i) - ((float*)mem_req.data)[i] = fr[instr.s.rs2 + i].f32; - - return mem_req; - }), - InstructionInfo(0x3, "cshit", InstrType::CUSTOM5, Encoding::S, RegFile::FLOAT, RegFile::INT, MEM_REQ_DECL - { - MemoryRequest mem_req; - mem_req.type = MemoryRequest::Type::STORE; - mem_req.size = sizeof(rtm::Hit); - mem_req.vaddr = unit->int_regs->registers[instr.s.rs1].u64 + s_imm(instr); - - Register32* fr = unit->float_regs->registers; - for(uint i = 0; i < sizeof(rtm::Hit) / sizeof(float); ++i) - ((float*)mem_req.data)[i] = fr[instr.s.rs2 + i].f32; - - return mem_req; - }), - InstructionInfo(0x4, "lhit", InstrType::CUSTOM6, Encoding::I, RegFile::FLOAT, RegFile::INT, MEM_REQ_DECL - { - //load hit record into registers [rd - (rd + N)] - MemoryRequest mem_req; - mem_req.type = MemoryRequest::Type::LOAD; - mem_req.size = sizeof(rtm::Hit); - mem_req.dst.push(DstReg(instr.rd, RegType::FLOAT32).u9, 9); - mem_req.vaddr = unit->int_regs->registers[instr.i.rs1].u64 + i_imm(instr); - - return mem_req; - }), - InstructionInfo(0x5, "traceray", InstrType::CUSTOM7, Encoding::I, RegFile::FLOAT, MEM_REQ_DECL - { - //load hit record into registers [rd - (rd + N)] - MemoryRequest mem_req; - mem_req.type = MemoryRequest::Type::LOAD; - mem_req.size = sizeof(rtm::Hit); - mem_req.dst.push(DstReg(instr.rd, RegType::FLOAT32).u9, 9); - mem_req.vaddr = unit->int_regs->registers[instr.i.rs1].u64 + i_imm(instr); - - return mem_req; - }), -}; - -const static InstructionInfo custom0(CUSTOM_OPCODE0, META_DECL{return isa_custom0_funct3[instr.i.funct3];}); - -}}} - -namespace DualStreaming { - -class UnitL1Cache : public Units::UnitCache -{ -private: - std::pair _treelet_range; - -public: - UnitL1Cache(const UnitCache::Configuration& config, const std::pair& treelet_range) : - UnitCache(config), _treelet_range(treelet_range) - { - } - -private: - UnitMemoryBase* _get_mem_higher(paddr_t addr) override - { - if(addr >= _treelet_range.first && addr < _treelet_range.second) - return _mem_highers[1]; - - return _mem_highers[0]; - } -}; - -#include "dual-streaming-kernel/include.hpp" -#include "dual-streaming-kernel/intersect.hpp" - -typedef Units::UnitCache UnitL2Cache; -typedef Units::UnitDRAMRamulator UnitDRAM; -typedef rtm::CompressedWideTreeletBVH::Treelet SceneSegment; - -static DualStreamingKernelArgs initilize_buffers(Units::UnitMainMemoryBase* main_memory, paddr_t& heap_address, const SimulationConfig& sim_config, uint page_size) -{ - std::string scene_name = sim_config.get_string("scene_name"); - std::string project_folder = get_project_folder_path(); - std::string scene_file = project_folder + "datasets\\" + scene_name + ".obj"; - std::string bvh_cache_filename = project_folder + "datasets\\cache\\" + scene_name + ".bvh"; - - DualStreamingKernelArgs args; - args.framebuffer_width = sim_config.get_int("framebuffer_width"); - args.framebuffer_height = sim_config.get_int("framebuffer_height"); - args.framebuffer_size = args.framebuffer_width * args.framebuffer_height; - heap_address = align_to(page_size, heap_address); - args.framebuffer = reinterpret_cast(heap_address); - heap_address += args.framebuffer_size * sizeof(uint32_t); - - std::vector hits(args.framebuffer_size, {T_MAX, rtm::vec2(0.0), ~0u}); - args.hit_records = write_vector(main_memory, page_size, hits, heap_address); - - args.light_dir = rtm::normalize(rtm::vec3(4.5f, 42.5f, 5.0f)); - - args.camera = sim_config.camera; - - args.pregen_rays = sim_config.get_int("pregen_rays"); - args.use_early = sim_config.get_int("use_early"); - args.hit_delay = sim_config.get_int("hit_delay"); - - rtm::Mesh mesh(scene_file); - std::vector build_objects; - mesh.get_build_objects(build_objects); - - rtm::BVH bvh2(bvh_cache_filename, build_objects); - mesh.reorder(build_objects); - - std::vector rays(args.framebuffer_size); - if(args.pregen_rays) - pregen_rays(args.framebuffer_width, args.framebuffer_height, args.camera, bvh2, mesh, sim_config.get_int("pregen_bounce"), rays); - args.rays = write_vector(main_memory, CACHE_BLOCK_SIZE, rays, heap_address); - -#if DS_USE_COMPRESSED_WIDE_BVH - //rtm::WBVH wbvh(bvh2, build_objects, &mesh, false); - //mesh.reorder(build_objects); - - //rtm::NVCWBVH cwbvh(wbvh); - - //rtm::CompressedWideTreeletBVH cwtbvh(cwbvh, wbvh.ft_blocks.data()); - //args.treelets = write_vector(main_memory, page_size, cwtbvh.treelets, heap_address); - //args.treelet_headers = write_vector(main_memory, page_size, cwtbvh.treelet_headers, heap_address); - //args.num_treelets = cwtbvh.treelets.size(); -#else - rtm::WBVH wbvh(bvh2, build_objects); - mesh.reorder(build_objects); - - rtm::WideTreeletBVH wtbvh(wbvh, mesh); - args.treelets = write_vector(main_memory, page_size, wtbvh.treelets, heap_address); - args.num_treelets = wtbvh.treelets.size(); -#endif - - std::vector tris; - mesh.get_triangles(tris); - args.tris = write_vector(main_memory, CACHE_BLOCK_SIZE, tris, heap_address); - - main_memory->direct_write(&args, sizeof(DualStreamingKernelArgs), DS_KERNEL_ARGS_ADDRESS); - return args; -} - -static void run_sim_dual_streaming(const SimulationConfig& sim_config) -{ - std::string project_folder = get_project_folder_path(); - -#if 1 //Modern config - double clock_rate = 2.0e9; - uint num_threads = 8; - uint num_tps = 64; - uint num_tms = 64; - uint64_t stack_size = 512; - - //Memory - uint num_partitions = 16; - uint partition_stride = 1 << 13; - - //DRAM - UnitDRAM::Configuration dram_config; - dram_config.config_path = project_folder + "build\\src\\arches-v2\\config-files\\gddr6_pch_config.yaml"; - dram_config.size = 4ull << 30; //4GB - dram_config.num_controllers = num_partitions; - dram_config.partition_stride = partition_stride; - - //L2$ - UnitL2Cache::Configuration l2_config; - l2_config.level = 2; - l2_config.size = 4 << 20; - l2_config.associativity = 16; - l2_config.num_ports = num_tms; - l2_config.num_slices = num_partitions; - l2_config.num_banks = 2; - l2_config.crossbar_width = 64; - l2_config.num_mshr = 192; - l2_config.latency = 170; - - UnitL2Cache::PowerConfig l2_power_config; - l2_power_config.leakage_power = 184.55e-3f * l2_config.num_banks; - l2_power_config.tag_energy = 0.00756563e-9f; - l2_power_config.read_energy = 0.378808e-9f - l2_power_config.tag_energy; - l2_power_config.write_energy = 0.365393e-9f - l2_power_config.tag_energy; - - //L1d$ - UnitL1Cache::Configuration l1d_config; - l1d_config.level = 1; - l1d_config.size = 64 << 10; - l1d_config.associativity = 32; - l1d_config.num_banks = 4; - l1d_config.crossbar_width = 4; - l1d_config.num_mshr = 256; - l1d_config.latency = 20; - - UnitL1Cache::PowerConfig l1d_power_config; - l1d_power_config.leakage_power = 7.19746e-3f * l1d_config.num_banks * num_tms; - l1d_power_config.tag_energy = 0.000663943e-9f; - l1d_power_config.read_energy = 0.0310981e-9f - l1d_power_config.tag_energy; - l1d_power_config.write_energy = 0.031744e-9f - l1d_power_config.tag_energy; - - //Scene buffer - Units::DualStreaming::UnitSceneBuffer::Configuration scene_buffer_config; - scene_buffer_config.size = 4 * 1024 * 1024; // 4MB - scene_buffer_config.latency = 4; - scene_buffer_config.num_banks = 32; - scene_buffer_config.bank_select_mask = generate_nbit_mask(log2i(scene_buffer_config.num_banks)) << log2i(CACHE_BLOCK_SIZE); - - Units::DualStreaming::UnitSceneBuffer::PowerConfig scene_buffer_power_config; - scene_buffer_power_config.leakage_power = 53.7192e-3f * scene_buffer_config.num_banks; - scene_buffer_power_config.read_energy = 0.118977e-9f; - scene_buffer_power_config.write_energy = 0.118977e-9f; -#else //Legacy config - -#endif - - ISA::RISCV::InstructionTypeNameDatabase::get_instance()[ISA::RISCV::InstrType::CUSTOM0] = "FCHTHRD"; - ISA::RISCV::InstructionTypeNameDatabase::get_instance()[ISA::RISCV::InstrType::CUSTOM1] = "BOXISECT"; - ISA::RISCV::InstructionTypeNameDatabase::get_instance()[ISA::RISCV::InstrType::CUSTOM2] = "TRIISECT"; - ISA::RISCV::InstructionTypeNameDatabase::get_instance()[ISA::RISCV::InstrType::CUSTOM3] = "LWI"; - ISA::RISCV::InstructionTypeNameDatabase::get_instance()[ISA::RISCV::InstrType::CUSTOM4] = "SWI"; - ISA::RISCV::InstructionTypeNameDatabase::get_instance()[ISA::RISCV::InstrType::CUSTOM5] = "CSHIT"; - ISA::RISCV::InstructionTypeNameDatabase::get_instance()[ISA::RISCV::InstrType::CUSTOM6] = "LHIT"; - ISA::RISCV::isa[ISA::RISCV::CUSTOM_OPCODE0] = ISA::RISCV::DualStreaming::custom0; - - uint num_sfus = static_cast(ISA::RISCV::InstrType::NUM_TYPES) * num_tms; - - Simulator simulator; - std::vector tps; - std::vector sfus; - std::vector thread_schedulers; - std::vector*> rtcs; - std::vector rsbs; - std::vector l1ds; - std::vector> unit_tables; unit_tables.reserve(num_tms); - std::vector> sfu_lists; sfu_lists.reserve(num_tms); - std::vector> mem_lists; mem_lists.reserve(num_tms); - - uint dram_ports_per_controller = 4; - dram_config.num_ports = dram_ports_per_controller * dram_config.num_controllers; - UnitDRAM dram(dram_config); - - simulator.register_unit(&dram); - simulator.new_unit_group(); - - Units::UnitBuffer::Configuration sram_config; - sram_config.latency = 1; - sram_config.size = 1 << 30; - sram_config.num_banks = num_partitions; - sram_config.num_ports = dram_ports_per_controller * num_partitions; - - Units::UnitBuffer sram(sram_config); - simulator.register_unit(&sram); - simulator.new_unit_group(); - - ELF elf(project_folder + "src\\dual-streaming-kernel\\riscv\\kernel"); - - dram.clear(); - paddr_t heap_address = elf.load(dram._data_u8); - DualStreamingKernelArgs kernel_args = DualStreaming::initilize_buffers(&dram, heap_address, sim_config, partition_stride); - heap_address = align_to(partition_stride * num_partitions, heap_address); - std::pair treelet_range = {(paddr_t)kernel_args.treelets, (paddr_t)kernel_args.treelets + kernel_args.num_treelets * sizeof(SceneSegment)}; - - std::set unused_dram_ports; - for(uint i = 0; i < dram_ports_per_controller; ++i) - unused_dram_ports.insert(i); - - l2_config.num_ports = num_tms; - l2_config.mem_highers = {&dram}; - l2_config.mem_higher_port = *unused_dram_ports.begin(); - l2_config.mem_higher_port_stride = dram_ports_per_controller; - unused_dram_ports.erase(*unused_dram_ports.begin()); - - UnitL2Cache l2(l2_config); - simulator.register_unit(&l2); - - Units::UnitAtomicRegfile atomic_regs(num_tms); - simulator.register_unit(&atomic_regs); - simulator.new_unit_group(); - - scene_buffer_config.segment_start = (paddr_t)kernel_args.treelets; - scene_buffer_config.segment_size = sizeof(SceneSegment); - scene_buffer_config.num_ports = num_tms; - scene_buffer_config.row_size = partition_stride; - scene_buffer_config.block_size = CACHE_BLOCK_SIZE; - scene_buffer_config.num_channels = num_partitions; - scene_buffer_config.main_mem = &dram; - scene_buffer_config.main_mem_port_offset = *unused_dram_ports.begin(); - scene_buffer_config.main_mem_port_stride = dram_ports_per_controller; - unused_dram_ports.erase(*unused_dram_ports.begin()); - - Units::DualStreaming::UnitSceneBuffer scene_buffer(scene_buffer_config); - simulator.register_unit(&scene_buffer); - - Units::DualStreaming::UnitStreamScheduler::Configuration stream_scheduler_config; - stream_scheduler_config.num_banks = 32; - stream_scheduler_config.num_channels = num_partitions; - stream_scheduler_config.traversal_scheme = sim_config.get_int("traversal_scheme"); - stream_scheduler_config.weight_scheme = sim_config.get_int("weight_scheme"); - stream_scheduler_config.num_tms = num_tms; - stream_scheduler_config.block_size = CACHE_BLOCK_SIZE; - stream_scheduler_config.row_size = partition_stride; - stream_scheduler_config.num_root_rays = kernel_args.framebuffer_size; - stream_scheduler_config.treelet_addr = *(paddr_t*)&kernel_args.treelets; - stream_scheduler_config.heap_addr = *(paddr_t*)&heap_address; - stream_scheduler_config.cheat_treelets = (rtm::WideTreeletBVH::Treelet::Header*)&dram._data_u8[(size_t)kernel_args.treelet_headers]; - stream_scheduler_config.main_mem = &dram; - stream_scheduler_config.main_mem_port_stride = dram_ports_per_controller; - stream_scheduler_config.main_mem_port_offset = *unused_dram_ports.begin(); - unused_dram_ports.erase(*unused_dram_ports.begin()); - - if(sim_config.get_int("use_scene_buffer")) - { - stream_scheduler_config.max_active_segments = scene_buffer_config.size / sizeof(SceneSegment); - stream_scheduler_config.scene_buffer = &scene_buffer; - } - else - { - stream_scheduler_config.l2_cache_port = l2_config.num_ports - 1; - stream_scheduler_config.l2_cache = nullptr; // &l2; - stream_scheduler_config.max_active_segments = num_tms * 2; - } - if(sim_config.get_int("rays_on_chip")) - { - stream_scheduler_config.main_mem = &sram; - } - - Units::DualStreaming::UnitStreamScheduler stream_scheduler(stream_scheduler_config); - simulator.register_unit(&stream_scheduler); - - Units::DualStreaming::UnitHitRecordUpdater::Configuration hit_record_updater_config; - hit_record_updater_config.num_tms = num_tms; - hit_record_updater_config.hit_record_start = *(paddr_t*)&kernel_args.hit_records; - hit_record_updater_config.cache_size = sim_config.get_int("hit_buffer_size"); // 128 * 16 = 2048B = 2KB - hit_record_updater_config.associativity = 8; - hit_record_updater_config.row_size = partition_stride; - hit_record_updater_config.num_channels = num_partitions; - hit_record_updater_config.main_mem = &dram; - hit_record_updater_config.main_mem_port_stride = dram_ports_per_controller; - hit_record_updater_config.main_mem_port_offset = *unused_dram_ports.begin(); - unused_dram_ports.erase(*unused_dram_ports.begin()); - - if(sim_config.get_int("hits_on_chip")) - { - std::vector hits(kernel_args.framebuffer_size); - for(auto& hit : hits) hit.t = T_MAX; - paddr_t address = *(paddr_t*)&kernel_args.hit_records; - write_vector(&sram, partition_stride, hits, address); - hit_record_updater_config.main_mem = &sram; - } - - Units::DualStreaming::UnitHitRecordUpdater hit_record_updater(hit_record_updater_config); - simulator.register_unit(&hit_record_updater); - simulator.new_unit_group(); - - l1d_config.num_ports = num_tps; -#ifdef DS_USE_RT_CORE - l1d_config.num_ports += 1 * l1d_config.num_ports / l1d_config.crossbar_width; //add extra port for RT core - l1d_config.crossbar_width += 1; -#endif - l1d_config.mem_highers = {&l2, &scene_buffer}; - for(uint tm_index = 0; tm_index < num_tms; ++tm_index) - { - std::vector unit_table((uint)ISA::RISCV::InstrType::NUM_TYPES, nullptr); - std::vector mem_list; - std::vector sfu_list; - - l1d_config.mem_higher_port = tm_index; - l1ds.push_back(_new UnitL1Cache(l1d_config, sim_config.get_int("use_scene_buffer") ? treelet_range : std::pair(0ull, 0ull))); - simulator.register_unit(l1ds.back()); - mem_list.push_back(l1ds.back()); - unit_table[(uint)ISA::RISCV::InstrType::LOAD] = l1ds.back(); - unit_table[(uint)ISA::RISCV::InstrType::STORE] = l1ds.back(); - - #if DS_USE_RT_CORE - rsbs.push_back(_new Units::DualStreaming::UnitRayStagingBuffer(1, tm_index, &stream_scheduler, &hit_record_updater)); - simulator.register_unit(rsbs.back()); - - Units::DualStreaming::UnitTreeletRTCore::Configuration rtc_config; - rtc_config.max_rays = 256; - rtc_config.num_tp = num_tps; - rtc_config.treelet_base_addr = (paddr_t)kernel_args.treelets; - rtc_config.hit_record_base_addr = (paddr_t)kernel_args.hit_records; - rtc_config.use_early_termination = sim_config.get_int("use_early"); - rtc_config.cache = l1ds.back(); - rtc_config.rsb = rsbs.back(); - - rtcs.push_back(_new Units::DualStreaming::UnitTreeletRTCore(rtc_config)); - simulator.register_unit(rtcs.back()); - mem_list.push_back(rtcs.back()); - unit_table[(uint)ISA::RISCV::InstrType::CUSTOM4] = rtcs.back(); //SWI - unit_table[(uint)ISA::RISCV::InstrType::CUSTOM6] = rtcs.back(); //LHIT - #else - rsbs.push_back(_new Units::DualStreaming::UnitRayStagingBuffer(num_tps, tm_index, &stream_scheduler, &hit_record_updater)); - simulator.register_unit(rsbs.back()); - mem_list.push_back(rsbs.back()); - unit_table[(uint)ISA::RISCV::InstrType::CUSTOM3] = rsbs.back(); //LWI - unit_table[(uint)ISA::RISCV::InstrType::CUSTOM4] = rsbs.back(); //SWI - unit_table[(uint)ISA::RISCV::InstrType::CUSTOM5] = rsbs.back(); //CSHIT - unit_table[(uint)ISA::RISCV::InstrType::CUSTOM6] = rsbs.back(); //LHIT - #endif - - thread_schedulers.push_back(_new Units::UnitThreadScheduler(num_tps, tm_index, &atomic_regs, 64)); - simulator.register_unit(thread_schedulers.back()); - mem_list.push_back(thread_schedulers.back()); - unit_table[(uint)ISA::RISCV::InstrType::ATOMIC] = thread_schedulers.back(); - unit_table[(uint)ISA::RISCV::InstrType::CUSTOM0] = thread_schedulers.back(); - - - //sfu_list.push_back(_new Units::UnitSFU(num_tps, 2, 1, num_tps)); - //simulator.register_unit(sfu_list.back()); - //unit_table[(uint)ISA::RISCV::InstrType::FADD] = sfu_list.back(); - //unit_table[(uint)ISA::RISCV::InstrType::FMUL] = sfu_list.back(); - //unit_table[(uint)ISA::RISCV::InstrType::FFMAD] = sfu_list.back(); - - sfu_list.push_back(_new Units::UnitSFU(num_tps / 8, 1, 1, num_tps)); - simulator.register_unit(sfu_list.back()); - unit_table[(uint)ISA::RISCV::InstrType::IMUL] = sfu_list.back(); - unit_table[(uint)ISA::RISCV::InstrType::IDIV] = sfu_list.back(); - - sfu_list.push_back(_new Units::UnitSFU(num_tps / 16, 6, 1, num_tps)); - simulator.register_unit(sfu_list.back()); - unit_table[(uint)ISA::RISCV::InstrType::FDIV] = sfu_list.back(); - unit_table[(uint)ISA::RISCV::InstrType::FSQRT] = sfu_list.back(); - - #if DS_USE_HARDWARE_INTERSECTORS - sfu_list.push_back(_new Units::UnitSFU(2, 3, 1, num_tps_per_tm)); - simulator.register_unit(sfu_list.back()); - unit_table[(uint)ISA::RISCV::InstrType::CUSTOM1] = sfu_list.back(); - - sfu_list.push_back(_new Units::UnitSFU(1, 22, 8, num_tps_per_tm)); - simulator.register_unit(sfu_list.back()); - unit_table[(uint)ISA::RISCV::InstrType::CUSTOM2] = sfu_list.back(); - #endif - - for(auto& sfu : sfu_list) - sfus.push_back(sfu); - - unit_tables.emplace_back(unit_table); - sfu_lists.emplace_back(sfu_list); - mem_lists.emplace_back(mem_list); - - for(uint tp_index = 0; tp_index < num_tps; ++tp_index) - { - Units::UnitTP::Configuration tp_config; - tp_config.tp_index = tp_index; - tp_config.tm_index = tm_index; - tp_config.stack_size = stack_size; - tp_config.cheat_memory = dram._data_u8; - tp_config.unit_table = &unit_tables.back(); - tp_config.unique_mems = &mem_lists.back(); - tp_config.unique_sfus = &sfu_lists.back(); - tp_config.num_threads = num_threads; - - tps.push_back(new Units::DualStreaming::UnitTP(tp_config)); - simulator.register_unit(tps.back()); - } - - simulator.new_unit_group(); - } - - printf("Starting Dual-Streaming\n"); - for(auto& tp : tps) - tp->set_entry_point(elf.elf_header->e_entry.u64); - - //master logs - Units::UnitBuffer::Log sram_log; - UnitDRAM::Log dram_log; - UnitL2Cache::Log l2_log; - UnitL1Cache::Log l1d_log; - Units::UnitTP::Log tp_log; - - Units::DualStreaming::UnitTreeletRTCore::Log rtc_log; - Units::DualStreaming::UnitSceneBuffer::Log sb_log; - Units::DualStreaming::UnitStreamScheduler::Log ss_log; - - uint delta = sim_config.get_int("logging_interval"); - auto start = std::chrono::high_resolution_clock::now(); - simulator.execute(delta, [&]() -> void - { - float delta_us = delta / (clock_rate / 1'000'000); - - Units::UnitBuffer::Log sram_delta_log = delta_log(sram_log, sram); - UnitDRAM::Log dram_delta_log = delta_log(dram_log, dram); - UnitL2Cache::Log l2_delta_log = delta_log(l2_log, l2); - UnitL1Cache::Log l1d_delta_log = delta_log(l1d_log, l1ds); - - Units::DualStreaming::UnitTreeletRTCore::Log rtc_delta_log = delta_log(rtc_log, rtcs); - Units::DualStreaming::UnitSceneBuffer::Log sb_delta_log = delta_log(sb_log, scene_buffer); - Units::DualStreaming::UnitStreamScheduler::Log ss_delta_log = delta_log(ss_log, stream_scheduler); - - printf(" \n"); - printf("Cycle: %lld \n", simulator.current_cycle); - printf("Threads Launched: %d \n", atomic_regs.iregs[0]); - printf("Buckets Launched: %lld \n", ss_log.buckets_launched); - printf("Segments Launched: %lld \n", ss_log.segments_launched); - printf("Prefetch queue size: %lld \n", stream_scheduler._l2_cache_prefetch_queue.size()); - printf(" \n"); - printf(" Ray Total: %8.1f bytes/cycle\n", (float)(ss_delta_log.buckets_generated + ss_delta_log.buckets_launched) * RAY_BUCKET_SIZE / delta); - printf(" Ray Write: %8.1f bytes/cycle\n", (float)ss_delta_log.buckets_generated * RAY_BUCKET_SIZE / delta); - printf(" Ray Read: %8.1f bytes/cycle\n", (float)ss_delta_log.buckets_launched * RAY_BUCKET_SIZE / delta); - printf(" Ray Write: %8.1f Mrays/s \n", (float)(ss_delta_log.buckets_generated) * Arches::Units::DualStreaming::RayBucket::MAX_RAYS / delta_us); - printf(" Ray Read: %8.1f Mrays/s \n", (float)(ss_delta_log.buckets_launched) * Arches::Units::DualStreaming::RayBucket::MAX_RAYS / delta_us); - printf(" \n"); - printf("Scene Fill: %8.1f bytes/cycle\n", (float)sb_delta_log.bytes_written / delta); - printf("Scene Read: %8.1f bytes/cycle\n", (float)sb_delta_log.bytes_read / delta); - printf(" \n"); - printf("DRAM Total: %8.1f bytes/cycle\n", (float)(dram_delta_log.bytes_read + dram_delta_log.bytes_written) / delta); - printf("DRAM Write: %8.1f bytes/cycle\n", (float)dram_delta_log.bytes_written / delta); - printf("DRAM Read: %8.1f bytes/cycle\n", (float)dram_delta_log.bytes_read / delta); - printf(" \n"); - printf("SRAM Total: %8.1f bytes/cycle\n", (float)(sram_delta_log.bytes_read + sram_delta_log.bytes_written) / delta); - printf("SRAM Write: %8.1f bytes/cycle\n", (float)sram_delta_log.bytes_written / delta); - printf("SRAM Read: %8.1f bytes/cycle\n", (float)sram_delta_log.bytes_read / delta); - printf(" \n"); - printf(" L2$ Read: %8.1f bytes/cycle\n", (float)l2_delta_log.bytes_read / delta); - printf(" L1d$ Read: %8.1f bytes/cycle\n", (float)l1d_delta_log.bytes_read / delta); - printf(" \n"); - printf(" L2$ Hit Rate: %8.1f%%\n", 100.0 * l2_delta_log.hits / l2_delta_log.get_total()); - printf("L1d$ Hit Rate: %8.1f%%\n", 100.0 * l1d_delta_log.hits / l1d_delta_log.get_total()); - printf(" \n"); - }); - auto stop = std::chrono::high_resolution_clock::now(); - - cycles_t frame_cycles = simulator.current_cycle; - double frame_time = frame_cycles / clock_rate; - double simulation_time = std::chrono::duration_cast(stop - start).count() / 1000.0; - float total_power = dram.total_power(); - - tp_log.print_profile(dram._data_u8); - - dram.print_stats(4, frame_cycles); - print_header("DRAM"); - delta_log(dram_log, dram); - dram_log.print(frame_cycles); - - print_header("SRAM"); - delta_log(sram_log, sram); - sram_log.print(frame_cycles); - - print_header("Stream Scheduler"); - delta_log(ss_log, stream_scheduler); - ss_log.print(); - - print_header("Scene Buffer"); - delta_log(sb_log, scene_buffer); - sb_log.print(frame_cycles); - total_power += sb_log.print_power(scene_buffer_power_config, frame_time); - - print_header("L2$"); - delta_log(l2_log, l2); - l2_log.print(frame_cycles); - total_power += l2_log.print_power(l2_power_config, frame_time); - - print_header("L1d$"); - delta_log(l1d_log, l1ds); - l1d_log.print(frame_cycles); - total_power += l1d_log.print_power(l1d_power_config, frame_time); - - print_header("TP"); - delta_log(tp_log, tps); - tp_log.print(tps.size()); - - if(!rtcs.empty()) - { - print_header("RT Core"); - delta_log(rtc_log, rtcs); - rtc_log.print(frame_cycles, rtcs.size()); - } - - float total_energy = total_power * frame_time; - - print_header("Performance Summary"); - printf("Cycles: %lld\n", frame_cycles); - printf("Clock rate: %.0f MHz\n", clock_rate / 1'000'000.0); - printf("Frame time: %.3g ms\n", frame_time * 1000.0); - printf("Mrays/s: %.0f\n", (float)ss_log.rays / frame_time / 1'000'000); - - print_header("Power Summary"); - printf("Energy: %.2f mJ\n", total_energy * 1000.0); - printf("Power: %.2f W\n", total_power); - printf("Mrays/J: %.2f\n", (float)ss_log.rays / total_energy / 1'000'000); - - print_header("Simulation Summary"); - printf("Simulation rate: %.2f KHz\n", frame_cycles / simulation_time / 1000.0); - printf("Simulation time: %.0f s\n", simulation_time); - - print_header("Treelet Histogram"); - -#if 0 - uint treelet_counts[16]; - std::map treelet_histos[16]; - - for(uint i = 0; i < 16; ++i) - treelet_counts[i] = 0; - - for(auto& a : l1d_log.profile_counters) - { - if(a.first >= treelet_range.first && a.first < treelet_range.second) - { - uint treelet_id = (a.first - (paddr_t)kernel_args.treelets) / rtm::PackedTreelet::SIZE; - paddr_t treelet_addr = (paddr_t)kernel_args.treelets + treelet_id * rtm::PackedTreelet::SIZE; - rtm::PackedTreelet::Header header; - dram.direct_read(&header, sizeof(rtm::PackedTreelet::Header), treelet_addr); - uint offset = a.first - treelet_addr; - - treelet_histos[header.depth][offset] += a.second; - - if(offset == 64) - treelet_counts[header.depth]++; - } - } - - for(uint i = 0; i < 16; ++i) - { - if(treelet_counts[i]) - { - printf("Depth %d (%d)\n", i, treelet_counts[i]); - uint64_t total = 0; - total += treelet_histos[i][64]; - for(auto& a : treelet_histos[i]) - { - printf("\t%6d:%.2f(%.2f%%)\n", a.first / 64, (double)a.second / treelet_counts[i], 100.0 * a.second / total); - } - printf("\n"); - } - } -#endif - stbi_flip_vertically_on_write(true); - dram.dump_as_png_uint8((paddr_t)kernel_args.framebuffer, kernel_args.framebuffer_width, kernel_args.framebuffer_height, "out.png"); - - for(auto& tp : tps) delete tp; - for(auto& sfu : sfus) delete sfu; - for(auto& l1d : l1ds) delete l1d; - for(auto& thread_scheduler : thread_schedulers) delete thread_scheduler; - for(auto& rtc : rtcs) delete rtc; - for(auto& rsb : rsbs) delete rsb; -} - -} -} diff --git a/src/arches-v2/main.cpp b/src/arches-v2/main.cpp index a71fc26..eec7a18 100644 --- a/src/arches-v2/main.cpp +++ b/src/arches-v2/main.cpp @@ -5,6 +5,7 @@ #include "units/trax/unit-rt-core.hpp" #include "trax-kernel/include.hpp" #include "trax-kernel/intersect.hpp" +#include "units/unit-texture.hpp" namespace Arches { @@ -108,6 +109,26 @@ const static InstructionInfo isa_custom0_funct3[8] = for(uint i = 0; i < sizeof(rtm::Ray) / sizeof(float); ++i) ((float*)mem_req.data)[i] = fr[instr.i.rs1 + i].f32; + return mem_req; + }), + InstructionInfo(0x6, "sample2d", InstrType::CUSTOM3, Encoding::I, RegFile::FLOAT, MEM_REQ_DECL + { + Register32* fr = unit->float_regs->registers; + + MemoryRequest mem_req; + mem_req.type = MemoryRequest::Type::STORE; + mem_req.size = 8; + mem_req.dst.push(DstReg(instr.rd, RegType::FLOAT32).u9, 9); + mem_req.vaddr = fr[instr.i.rs1].u32; + + ((float*)mem_req.data)[0] = fr[instr.i.rs1 + 1].f32; + ((float*)mem_req.data)[1] = fr[instr.i.rs1 + 2].f32; + + //fr[instr.i.rd + 0].f32 = fr[instr.i.rs1 + 1].f32; + //fr[instr.i.rd + 1].f32 = fr[instr.i.rs1 + 2].f32; + //fr[instr.i.rd + 2].f32 = 0.5f; + //fr[instr.i.rd + 3].f32 = 1.0f; + return mem_req; }), }; @@ -162,6 +183,8 @@ static TRaXKernelArgs initilize_buffers(Units::UnitMainMemoryBase** drams, const args.rays = write_vector(drams, xbar, 256, rays, heap_address); } + args.materials = write_vector(drams, xbar, 256, mesh.materials, heap_address); + args.nodes = write_vector(drams, xbar, 256, bvh.nodes, heap_address); #if USE_HECWBVH_V1 @@ -188,7 +211,8 @@ static TRaXKernelArgs initilize_buffers(Units::UnitMainMemoryBase** drams, const tex.texels = dev_tex; } - args.materials = write_vector(drams, xbar, 256, mesh.materials, heap_address); + paddr_t mat_addr = (paddr_t)args.materials; + args.materials = write_vector(drams, xbar, 256, mesh.materials, mat_addr); for(uint32_t i = 0; i < mesh.materials.size(); ++i) mesh.materials[i].albedo_texture.texels = nullptr; @@ -388,6 +412,8 @@ static void run_sim_trax(SimulationConfig& sim_config) UnitL1Cache::PowerConfig l1d_power_config; + Units::UnitTexture::Configuration tu_config; + UnitRTCore::Configuration rtc_config; rtc_config.max_rays = 64; rtc_config.num_cache_ports = 4; @@ -451,6 +477,7 @@ static void run_sim_trax(SimulationConfig& sim_config) ISA::RISCV::InstructionTypeNameDatabase::get_instance()[ISA::RISCV::InstrType::CUSTOM0] = "FCHTHRD"; ISA::RISCV::InstructionTypeNameDatabase::get_instance()[ISA::RISCV::InstrType::CUSTOM1] = "BOXISECT"; ISA::RISCV::InstructionTypeNameDatabase::get_instance()[ISA::RISCV::InstrType::CUSTOM2] = "TRIISECT"; + ISA::RISCV::InstructionTypeNameDatabase::get_instance()[ISA::RISCV::InstrType::CUSTOM3] = "SAMPLE2D"; ISA::RISCV::InstructionTypeNameDatabase::get_instance()[ISA::RISCV::InstrType::CUSTOM7] = "TRACERAY"; ISA::RISCV::isa[ISA::RISCV::CUSTOM_OPCODE0] = ISA::RISCV::TRaX::custom0; @@ -461,6 +488,7 @@ static void run_sim_trax(SimulationConfig& sim_config) std::vector sfus; std::vector thread_schedulers; std::vector rtcs; + std::vector tus; std::vector l1ds; std::vector> unit_tables; unit_tables.reserve(num_tms); std::vector> sfu_lists; sfu_lists.reserve(num_tms); @@ -500,6 +528,9 @@ static void run_sim_trax(SimulationConfig& sim_config) TRaXKernelArgs kernel_args = initilize_buffers((Units::UnitMainMemoryBase**)drams.data(), xbar, heap_address, sim_config, partition_stride); heap_address = align_to(partition_stride, heap_address); + for(uint addr = 0; addr < (256 << 20); addr += partition_stride) + drams[xbar.get_partition(addr)]->direct_read(vec_mem.data() + addr, partition_stride, xbar.strip_partition_bits(addr)); + //bool warm_l2 = false; //if(warm_l2) //{ @@ -576,6 +607,16 @@ static void run_sim_trax(SimulationConfig& sim_config) //l1ss.push_back(new Units::UnitStreamCache(l1s_config)); //simulator.register_unit(l1ss.back()); + tu_config.num_clients = num_tps; + tu_config.cache = l1ds.back(); + tu_config.cache_port = num_tps + 1; + tu_config.cheat_mem = vec_mem.data(); + + tus.push_back(_new Units::UnitTexture(tu_config)); + simulator.register_unit(tus.back()); + mem_list.push_back(tus.back()); + unit_table[(uint)ISA::RISCV::InstrType::CUSTOM3] = tus.back(); + #if TRAX_USE_RT_CORE rtc_config.num_clients = num_tps; rtc_config.node_base_addr = (paddr_t)kernel_args.nodes; diff --git a/src/arches-v2/trax.hpp b/src/arches-v2/trax.hpp deleted file mode 100644 index ef695ec..0000000 --- a/src/arches-v2/trax.hpp +++ /dev/null @@ -1,730 +0,0 @@ -#pragma once - -#include "shared-utils.hpp" -#include "units/trax/unit-tp.hpp" -#include "units/trax/unit-rt-core.hpp" -#include "trax-kernel/include.hpp" -#include "trax-kernel/intersect.hpp" - -namespace Arches { - -namespace ISA { namespace RISCV { namespace TRaX { - -//see the opcode map for details -const static InstructionInfo isa_custom0_000_imm[8] = -{ - InstructionInfo(0x0, "fchthrd", InstrType::CUSTOM0, Encoding::U, RegFile::INT, MEM_REQ_DECL - { - MemoryRequest req; - req.type = MemoryRequest::Type::LOAD; - req.size = sizeof(uint32_t); - req.dst.push(DstReg(instr.rd, RegType::UINT32).u9, 9); - req.vaddr = 0x0ull; - return req; - }), - InstructionInfo(0x1, "boxisect", InstrType::CUSTOM1, Encoding::U, RegFile::FLOAT, EXEC_DECL - { - Register32 * fr = unit->float_regs->registers; - - rtm::Ray ray; - rtm::vec3 inv_d; - ray.o.x = fr[0].f32; - ray.o.y = fr[1].f32; - ray.o.z = fr[2].f32; - ray.t_min = fr[3].f32; - inv_d.x = fr[4].f32; - inv_d.y = fr[5].f32; - inv_d.z = fr[6].f32; - ray.t_max = fr[7].f32; - - rtm::AABB aabb; - aabb.min.x = fr[8].f32; - aabb.min.y = fr[9].f32; - aabb.min.z = fr[10].f32; - aabb.max.x = fr[11].f32; - aabb.max.y = fr[12].f32; - aabb.max.z = fr[13].f32; - - unit->float_regs->registers[instr.u.rd].f32 = rtm::intersect(aabb, ray, inv_d); - }), - InstructionInfo(0x2, "triisect", InstrType::CUSTOM2, Encoding::U, RegFile::FLOAT, EXEC_DECL - { - Register32 * fr = unit->float_regs->registers; - - rtm::Ray ray; - ray.o.x = fr[0].f32; - ray.o.y = fr[1].f32; - ray.o.z = fr[2].f32; - ray.t_min = fr[3].f32; - ray.d.x = fr[4].f32; - ray.d.y = fr[5].f32; - ray.d.z = fr[6].f32; - ray.t_max = fr[7].f32; - - rtm::Triangle tri; - tri.vrts[0].x = fr[8].f32; - tri.vrts[0].y = fr[9].f32; - tri.vrts[0].z = fr[10].f32; - tri.vrts[1].x = fr[11].f32; - tri.vrts[1].y = fr[12].f32; - tri.vrts[1].z = fr[13].f32; - tri.vrts[2].x = fr[14].f32; - tri.vrts[2].y = fr[15].f32; - tri.vrts[2].z = fr[16].f32; - - rtm::Hit hit; - hit.t = fr[17].f32; - hit.bc[0] = fr[18].f32; - hit.bc[1] = fr[19].f32; - hit.id = fr[20].u32; - - rtm::intersect(tri, ray, hit); - - fr[17].f32 = hit.t; - fr[18].f32 = hit.bc[0]; - fr[19].f32 = hit.bc[1]; - fr[20].u32 = hit.id; - }), -}; - -const static InstructionInfo isa_custom0_funct3[8] = -{ - InstructionInfo(0x0, META_DECL{return isa_custom0_000_imm[instr.u.imm_31_12 >> 3]; }), - InstructionInfo(0x1, IMPL_NONE), - InstructionInfo(0x2, IMPL_NONE), - InstructionInfo(0x3, IMPL_NONE), - InstructionInfo(0x4, IMPL_NONE), - InstructionInfo(0x5, "traceray", InstrType::CUSTOM7, Encoding::I, RegFile::FLOAT, MEM_REQ_DECL - { - MemoryRequest mem_req; - mem_req.type = MemoryRequest::Type::STORE; - mem_req.size = sizeof(rtm::Ray); - mem_req.dst.push(DstReg(instr.rd, RegType::FLOAT32).u9, 9); - mem_req.vaddr = 0xdeadbeefull; - - Register32* fr = unit->float_regs->registers; - for(uint i = 0; i < sizeof(rtm::Ray) / sizeof(float); ++i) - ((float*)mem_req.data)[i] = fr[instr.i.rs1 + i].f32; - - return mem_req; - }), -}; - -const static InstructionInfo custom0(CUSTOM_OPCODE0, META_DECL{return isa_custom0_funct3[instr.i.funct3];}); - -}}} - -namespace TRaX { - -typedef Units::UnitDRAMRamulator UnitDRAM; -typedef Units::UnitCache UnitL2Cache; -typedef Units::UnitCache UnitL1Cache; -typedef rtm::FTB PrimBlocks; -typedef Units::TRaX::UnitRTCore UnitRTCore; - -static TRaXKernelArgs initilize_buffers(uint8_t* main_memory, paddr_t& heap_address, const SimulationConfig& sim_config, uint page_size) -{ - std::string scene_name = sim_config.get_string("scene_name"); - std::string project_folder = get_project_folder_path(); - std::string datasets_folder = project_folder + "datasets\\"; - std::string cache_folder = project_folder + "datasets\\cache\\"; - - TRaXKernelArgs args; - args.framebuffer_width = sim_config.get_int("framebuffer_width"); - args.framebuffer_height = sim_config.get_int("framebuffer_height"); - args.framebuffer_size = args.framebuffer_width * args.framebuffer_height; - heap_address = align_to(page_size, heap_address); - args.framebuffer = reinterpret_cast(heap_address); - heap_address += args.framebuffer_size * sizeof(uint32_t); - - args.pregen_rays = sim_config.get_int("pregen_rays"); - uint pregen_bounce = sim_config.get_int("pregen_bounce"); - - args.light_dir = rtm::normalize(rtm::vec3(4.5f, 42.5f, 5.0f)); - args.camera = sim_config.camera; - - rtm::Mesh mesh(datasets_folder + scene_name + ".obj"); - rtm::CWBVH bvh(mesh, (cache_folder + scene_name + ".bvh").c_str()); - - std::vector rays(args.framebuffer_size); - if(args.pregen_rays) - { - std::string ray_file = scene_name + "-" + std::to_string(args.framebuffer_width) + "-" + std::to_string(pregen_bounce) + ".rays"; - #if USE_HECWBVH - pregen_rays(&bvh.nodes[0], &bvh.nodes[0].ftb, mesh, args.framebuffer_width, args.framebuffer_height, args.camera, pregen_bounce, rays); - #else - pregen_rays(&bvh.nodes[0], &bvh.ftbs[0], mesh, args.framebuffer_width, args.framebuffer_height, args.camera, pregen_bounce, rays); - #endif - args.rays = write_vector(main_memory, 256, rays, heap_address); - } - - args.nodes = write_vector(main_memory, 256, bvh.nodes, heap_address); - -#if USE_HECWBVH - args.ftbs = (rtm::FTB*)args.nodes; -#else - args.ftbs = write_vector(main_memory, 256, bvh.ftbs, heap_address); -#endif - - std::vector tris; - mesh.get_triangles(tris); - args.tris = write_vector(main_memory, 256, tris, heap_address); - - std::memcpy(main_memory + TRAX_KERNEL_ARGS_ADDRESS, &args, sizeof(TRaXKernelArgs)); - return args; -} - -static void run_sim_trax(SimulationConfig& sim_config) -{ - std::string project_folder_path = get_project_folder_path(); - -#if 0 //RTX 4090 ish - //Compute - double core_clock = 2235.0e6; - double dram_clock = 5250.0e6; - uint64_t stack_size = 512; - uint num_tms = 128; - uint num_tps = 128; - uint num_threads = 12; - - //Memory - uint64_t block_size = CACHE_BLOCK_SIZE; - uint num_partitions = 12; - uint partition_stride = 1 << 12; - - //DRAM - UnitDRAM::Configuration dram_config; - dram_config.config_path = project_folder_path + "build\\src\\arches-v2\\config-files\\gddr6x_21000_config.yaml"; - dram_config.size = 1ull << 30; //1GB per partition - dram_config.clock_ratio = dram_clock / core_clock; - dram_config.latency = 254; - - //L2$ - UnitL2Cache::Configuration l2_config; - l2_config.level = 2; - //l2_config.block_prefetch = true; - l2_config.miss_alloc = true; - l2_config.size = 6 << 20; - l2_config.associativity = 16; - l2_config.policy = Units::UnitCacheBase::Policy::LRU; - l2_config.num_slices = 6; - l2_config.crossbar_width = l2_config.num_slices; - l2_config.num_mshr = 256; - l2_config.num_subentries = 4; - l2_config.latency = 187; - - UnitL2Cache::PowerConfig l2_power_config; - - Units::UnitCrossbar::Configuration xbar_config; - xbar_config.num_slices = l2_config.num_slices; - xbar_config.slice_stride = l2_config.block_size; - xbar_config.num_partitions = num_partitions; - xbar_config.partition_stride = partition_stride; - - //L1d$ - UnitL1Cache::Configuration l1d_config; - l1d_config.level = 1; - l1d_config.miss_alloc = true; - l1d_config.size = 128 << 10; - l1d_config.associativity = 32; - l1d_config.policy = Units::UnitCacheBase::Policy::LRU; - l1d_config.num_banks = 4; - l1d_config.crossbar_width = l1d_config.num_banks; - l1d_config.num_mshr = 512; - l1d_config.num_subentries = 16; - l1d_config.latency = 39; - - UnitL1Cache::PowerConfig l1d_power_config; - - UnitRTCore::Configuration rtc_config; - rtc_config.max_rays = 32; - rtc_config.num_cache_ports = 4; - -#elif 0 //RTX 3070 ish - //Compute - double core_clock = 1500.0e6; - uint64_t stack_size = 512; - uint num_tms = 46; - uint num_tps = 128; - uint num_threads = 12; - - //Memory - double dram_clock = 3500.0e6; - uint num_partitions = 8; - uint partition_stride = 1 << 12; - - //DRAM - UnitDRAM::Configuration dram_config; - dram_config.config_path = project_folder_path + "build\\src\\arches-v2\\config-files\\gddr6_14000_config.yaml"; - dram_config.size = 1ull << 30; //1GB per partition - dram_config.clock_ratio = dram_clock / core_clock; - dram_config.latency = 254; - - //L2$ - UnitL2Cache::Configuration l2_config; - l2_config.level = 2; - l2_config.miss_alloc = true; - l2_config.size = 512 << 10; - l2_config.associativity = 16; - l2_config.policy = Units::UnitCacheBase::Policy::LRU; - l2_config.num_slices = 4; - l2_config.crossbar_width = l2_config.num_slices; - l2_config.num_mshr = 192; - l2_config.num_subentries = 4; - l2_config.latency = 187; - - UnitL2Cache::PowerConfig l2_power_config; - - Units::UnitCrossbar::Configuration xbar_config; - xbar_config.num_slices = l2_config.num_slices; - xbar_config.slice_stride = l2_config.block_size; - xbar_config.num_partitions = num_partitions; - xbar_config.partition_stride = partition_stride; - - //L1d$ - UnitL1Cache::Configuration l1d_config; - l1d_config.level = 1; - l1d_config.miss_alloc = true; - l1d_config.size = 128 << 10; - l1d_config.associativity = 32; - l1d_config.policy = Units::UnitCacheBase::Policy::FIFO; - l1d_config.num_banks = 4; - l1d_config.crossbar_width = l1d_config.num_banks; - l1d_config.num_mshr = 384; - l1d_config.num_subentries = 48; - l1d_config.latency = 39; - - UnitL1Cache::PowerConfig l1d_power_config; - - UnitRTCore::Configuration rtc_config; - rtc_config.max_rays = 128; - rtc_config.num_cache_ports = 4; - -#elif 1 //Turing spec -#if 1 //RTX 2080 - double core_clock = 1515.0e6; - uint num_threads = 8; - uint num_tps = 64; - uint num_tms = 46; - uint64_t stack_size = 1024; - - double dram_clock = 3500.0e6; - uint num_partitions = 8; - uint partition_stride = 1 << 12; -#else //RTX 2060 - double core_clock = 1365.0e6; - uint num_threads = 1; - uint num_tps = 64; - uint num_tms = 30; - uint64_t stack_size = 512; - - double dram_clock = 3500.0e6; - uint num_partitions = 6; - uint partition_stride = 1 << 12; -#endif - - //DRAM - UnitDRAM::Configuration dram_config; - dram_config.config_path = project_folder_path + "build\\src\\arches-v2\\config-files\\gddr6_14000_config.yaml"; - dram_config.size = 1ull << 30; //1GB - dram_config.clock_ratio = dram_clock / core_clock; - dram_config.latency = 92; - - //L2$ - UnitL2Cache::Configuration l2_config; - l2_config.level = 2; - l2_config.miss_alloc = true; - l2_config.size = 512 << 10; - l2_config.associativity = 16; - l2_config.num_slices = 4; - l2_config.crossbar_width = l2_config.num_slices; - l2_config.num_mshr = 192; - l2_config.num_subentries = 4; - l2_config.latency = 160; - - UnitL2Cache::PowerConfig l2_power_config; - - Units::UnitCrossbar::Configuration xbar_config; - xbar_config.num_slices = l2_config.num_slices; - xbar_config.slice_stride = l2_config.block_size; - xbar_config.num_partitions = num_partitions; - xbar_config.partition_stride = partition_stride; - - //L1d$ - UnitL1Cache::Configuration l1d_config; - l1d_config.level = 1; - l1d_config.miss_alloc = true; - l1d_config.size = 64 << 10; - l1d_config.associativity = 32; - l1d_config.num_banks = 16; - l1d_config.crossbar_width = l1d_config.num_banks; - l1d_config.num_mshr = 256; - l1d_config.num_subentries = 16; - l1d_config.latency = 20; - - UnitL1Cache::PowerConfig l1d_power_config; - - UnitRTCore::Configuration rtc_config; - rtc_config.max_rays = 64; - rtc_config.num_cache_ports = 2; -#else //TRaX 1.0 - double core_clock = 1000.0e6; - uint num_threads = 1; - uint num_tps = 32; - uint num_tms = 32; - uint num_rays = 32; - uint64_t stack_size = 4096; - - double dram_clock = 2000.0e6; - uint num_partitions = 4; - uint partition_stride = 1 << 12; - - //DRAM - UnitDRAM::Configuration dram_config; - dram_config.config_path = project_folder_path + "build\\src\\arches-v2\\config-files\\gddr6_pch_config.yaml"; - dram_config.size = 1ull << 30; //1GB - dram_config.clock_ratio = dram_clock / core_clock; - dram_config.latency = 1; - //dram_config.latency = 56; - - //L2$ - UnitL2Cache::Configuration l2_config; - l2_config.level = 2; - l2_config.miss_alloc = false; - l2_config.size = 256 << 10; - l2_config.associativity = 16; - l2_config.num_slices = 1; - l2_config.crossbar_width = l2_config.num_slices; - l2_config.num_mshr = 1; - l2_config.num_subentries = 1; - l2_config.latency = 10; - - UnitL2Cache::PowerConfig l2_power_config; - - Units::UnitCrossbar::Configuration xbar_config; - xbar_config.num_slices = l2_config.num_slices; - xbar_config.slice_stride = l2_config.block_size; - xbar_config.num_partitions = num_partitions; - xbar_config.partition_stride = partition_stride; - - //L1d$ - UnitL1Cache::Configuration l1d_config; - l1d_config.level = 1; - l1d_config.miss_alloc = false; - l1d_config.size = 32 << 10; - l1d_config.associativity = 4; - l1d_config.num_banks = 8; - l1d_config.crossbar_width = l1d_config.num_banks; - l1d_config.num_mshr = 256; - l1d_config.num_subentries = 16; - l1d_config.latency = 1; - - UnitL1Cache::PowerConfig l1d_power_config; -#endif - - ELF elf(project_folder_path + "src\\trax-kernel\\riscv\\kernel"); - - ISA::RISCV::InstructionTypeNameDatabase::get_instance()[ISA::RISCV::InstrType::CUSTOM0] = "FCHTHRD"; - ISA::RISCV::InstructionTypeNameDatabase::get_instance()[ISA::RISCV::InstrType::CUSTOM1] = "BOXISECT"; - ISA::RISCV::InstructionTypeNameDatabase::get_instance()[ISA::RISCV::InstrType::CUSTOM2] = "TRIISECT"; - ISA::RISCV::InstructionTypeNameDatabase::get_instance()[ISA::RISCV::InstrType::CUSTOM7] = "TRACERAY"; - ISA::RISCV::isa[ISA::RISCV::CUSTOM_OPCODE0] = ISA::RISCV::TRaX::custom0; - - uint num_sfus = static_cast(ISA::RISCV::InstrType::NUM_TYPES) * num_tms; - - Simulator simulator; - std::vector tps; - std::vector sfus; - std::vector thread_schedulers; - std::vector rtcs; - std::vector l1ds; - std::vector> unit_tables; unit_tables.reserve(num_tms); - std::vector> sfu_lists; sfu_lists.reserve(num_tms); - std::vector> mem_lists; mem_lists.reserve(num_tms); - - //construct memory partitions - std::vector drams; - std::vector l2s; - dram_config.num_ports = l2_config.num_slices; - l2_config.num_ports = l2_config.num_slices; - for(uint i = 0; i < num_partitions; ++i) - { - drams.push_back(_new UnitDRAM(dram_config)); - simulator.register_unit(drams.back()); - - l2_config.mem_higher_port = 0; - l2_config.mem_highers = {drams.back()}; - l2s.push_back(_new UnitL2Cache(l2_config)); - simulator.register_unit(l2s.back()); - simulator.new_unit_group(); - - xbar_config.mem_highers.push_back(l2s.back()); - } - - xbar_config.num_clients = num_tms; - Units::UnitCrossbar xbar(xbar_config); - simulator.register_unit(&xbar); - simulator.new_unit_group(); - - uint8_t* device_mem = (uint8_t*)malloc(3 << 29); - paddr_t heap_address = elf.load(device_mem); - TRaXKernelArgs kernel_args = initilize_buffers(device_mem, heap_address, sim_config, partition_stride); - heap_address = align_to(partition_stride, heap_address); - - for(uint addr = 0; addr < heap_address; addr += partition_stride) - drams[xbar.get_partition(addr)]->direct_write(device_mem + addr, partition_stride, xbar.strip_partition_bits(addr)); - - bool warm_l2 = false; - if(warm_l2) - { - paddr_t start = (paddr_t)kernel_args.nodes & ~(1 - partition_stride); - paddr_t end = start + l2_config.size * num_partitions; - for(paddr_t block_addr = end - l2_config.block_size; block_addr >= start; block_addr -= l2_config.block_size) - l2s[xbar.get_partition(block_addr)]->direct_write(xbar.strip_partition_bits(block_addr), device_mem + block_addr); - } - - bool deserialize_l2 = false, serialize_l2 = !deserialize_l2; - if(deserialize_l2) - for(uint i = 0; i < num_partitions; ++i) - serialize_l2 = !l2s[i]->deserialize("l2-p" + std::to_string(i) + ".bin", *drams[i]); - - Units::UnitAtomicRegfile atomic_regs(num_tms); - simulator.register_unit(&atomic_regs); - simulator.new_unit_group(); - - l1d_config.num_ports = num_tps; - l1d_config.mem_highers = {&xbar}; -#if TRAX_USE_RT_CORE - l1d_config.num_ports += num_tps; - l1d_config.crossbar_width *= 2; -#endif - - for(uint tm_index = 0; tm_index < num_tms; ++tm_index) - { - std::vector unit_table((uint)ISA::RISCV::InstrType::NUM_TYPES, nullptr); - std::vector mem_list; - std::vector sfu_list; - - l1d_config.mem_higher_port = tm_index; - l1ds.push_back(new UnitL1Cache(l1d_config)); - simulator.register_unit(l1ds.back()); - mem_list.push_back(l1ds.back()); - unit_table[(uint)ISA::RISCV::InstrType::LOAD] = l1ds.back(); - unit_table[(uint)ISA::RISCV::InstrType::STORE] = l1ds.back(); - - thread_schedulers.push_back(_new Units::UnitThreadScheduler(num_tps, tm_index, &atomic_regs, 32)); - simulator.register_unit(thread_schedulers.back()); - mem_list.push_back(thread_schedulers.back()); - unit_table[(uint)ISA::RISCV::InstrType::CUSTOM0] = thread_schedulers.back(); - - //sfu_list.push_back(_new Units::UnitSFU(num_tps_per_tm, 2, 1, num_tps_per_tm)); - //simulator.register_unit(sfu_list.back()); - //unit_table[(uint)ISA::RISCV::InstrType::FADD] = sfu_list.back(); - //unit_table[(uint)ISA::RISCV::InstrType::FMUL] = sfu_list.back(); - //unit_table[(uint)ISA::RISCV::InstrType::FFMAD] = sfu_list.back(); - - sfu_list.push_back(_new Units::UnitSFU(num_tps / 8, 1, 1, num_tps)); - simulator.register_unit(sfu_list.back()); - unit_table[(uint)ISA::RISCV::InstrType::IMUL] = sfu_list.back(); - unit_table[(uint)ISA::RISCV::InstrType::IDIV] = sfu_list.back(); - - sfu_list.push_back(_new Units::UnitSFU(num_tps / 16, 6, 1, num_tps)); - simulator.register_unit(sfu_list.back()); - unit_table[(uint)ISA::RISCV::InstrType::FDIV] = sfu_list.back(); - unit_table[(uint)ISA::RISCV::InstrType::FSQRT] = sfu_list.back(); - - #if TRAX_USE_HARDWARE_INTERSECTORS - sfu_list.push_back(_new Units::UnitSFU(2, 3, 1, num_tps_per_tm)); - simulator.register_unit(sfu_list.back()); - unit_table[(uint)ISA::RISCV::InstrType::CUSTOM1] = sfu_list.back(); - - sfu_list.push_back(_new Units::UnitSFU(1, 22, 8, num_tps_per_tm)); - simulator.register_unit(sfu_list.back()); - unit_table[(uint)ISA::RISCV::InstrType::CUSTOM2] = sfu_list.back(); - #endif - - for(auto& sfu : sfu_list) - sfus.push_back(sfu); - - //l1s_config.mem_higher_port = tm_index * 2 + 1; - //l1ss.push_back(new Units::UnitStreamCache(l1s_config)); - //simulator.register_unit(l1ss.back()); - - #if TRAX_USE_RT_CORE - rtc_config.num_clients = num_tps; - rtc_config.node_base_addr = (paddr_t)kernel_args.nodes; - rtc_config.tri_base_addr = (paddr_t)kernel_args.ftbs; - rtc_config.cache = l1ds.back(); - rtc_config.cache_port = num_tps; - rtc_config.cache_port_stride = num_tps / l1d_config.num_banks; - - rtcs.push_back(_new UnitRTCore(rtc_config)); - simulator.register_unit(rtcs.back()); - mem_list.push_back(rtcs.back()); - unit_table[(uint)ISA::RISCV::InstrType::CUSTOM7] = rtcs.back(); - #endif - - unit_tables.emplace_back(unit_table); - sfu_lists.emplace_back(sfu_list); - mem_lists.emplace_back(mem_list); - - Units::UnitTP::Configuration tp_config; - tp_config.tm_index = tm_index; - tp_config.stack_size = stack_size; - tp_config.cheat_memory = device_mem; - tp_config.unique_mems = &mem_lists.back(); - tp_config.unique_sfus = &sfu_lists.back(); - tp_config.num_threads = num_threads; - for(uint tp_index = 0; tp_index < num_tps; ++tp_index) - { - tp_config.tp_index = tp_index; - tp_config.unit_table = &unit_tables.back(); - tps.push_back(new Units::TRaX::UnitTP(tp_config)); - simulator.register_unit(tps.back()); - } - - simulator.new_unit_group(); - } - - printf("Starting TRaX\n"); - for(auto& tp : tps) - tp->set_entry_point(elf.elf_header->e_entry.u64); - - //master logs - UnitDRAM::Log dram_log; - UnitL2Cache::Log l2_log; - UnitL1Cache::Log l1d_log; - Units::UnitTP::Log tp_log; - - UnitRTCore::Log rtc_log; - - uint delta = sim_config.get_int("logging_interval"); - float delta_s = delta / core_clock; - float delta_ns = delta_s * 1e9; - float delta_dram_cycles = delta_s * dram_clock; - float peak_dram_bandwidth = dram_clock / core_clock * num_partitions * 4 * 32 / 8; - float peak_l2_bandwidth = num_partitions * l2_config.num_slices * MemoryRequest::MAX_SIZE; - float peak_l1d_bandwidth = num_tms * l1d_config.num_banks * MemoryRequest::MAX_SIZE; - - //peak_dram_bandwidth = 32 * core_clock * num_partitions; - //peak_l2_bandwidth = 32 * num_tms / 2; - - auto start = std::chrono::high_resolution_clock::now(); - simulator.execute(delta, [&]() -> void - { - UnitDRAM::Log dram_delta_log = delta_log(dram_log, drams); - UnitL2Cache::Log l2_delta_log = delta_log(l2_log, l2s); - UnitL1Cache::Log l1d_delta_log = delta_log(l1d_log, l1ds); - UnitRTCore::Log rtc_delta_log = delta_log(rtc_log, rtcs); - - double simulation_time = std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - start).count() / 1000.0; - - printf(" \n"); - printf("Cycle: %lld \n", simulator.current_cycle); - printf("Threads Launched: %d \n", atomic_regs.iregs[0]); - printf("Simulation rate: %.2f KHz\n", simulator.current_cycle / simulation_time / 1000.0); - printf(" \n"); - printf("DRAM Read: %8.1f GB/s (%.2f%%)\n", (float)dram_delta_log.bytes_read / delta_ns, 100.0f * dram_delta_log.bytes_read / delta / peak_dram_bandwidth); - printf(" L2$ Read: %8.1f B/clk (%.2f%%)\n", (float)l2_delta_log.bytes_read / delta, 100.0f * l2_delta_log.bytes_read / delta / peak_l2_bandwidth); - printf("L1d$ Read: %8.1f B/clk (%.2f%%)\n", (float)l1d_delta_log.bytes_read / delta, 100.0 * l1d_delta_log.bytes_read / delta / peak_l1d_bandwidth); - printf(" \n"); - printf(" L2$ Hit/Half/Miss: %3.1f%%/%3.1f%%/%3.1f%%\n", 100.0 * l2_delta_log.hits / l2_delta_log.get_total(), 100.0 * l2_delta_log.half_misses / l2_delta_log.get_total(), 100.0 * l2_delta_log.misses / l2_delta_log.get_total()); - printf("L1d$ Hit/Half/Miss: %3.1f%%/%3.1f%%/%3.1f%%\n", 100.0 * l1d_delta_log.hits / l1d_delta_log.get_total(), 100.0 * l1d_delta_log.half_misses / l1d_delta_log.get_total(), 100.0 * l1d_delta_log.misses / l1d_delta_log.get_total()); - printf(" \n"); - printf(" L2$ Stalls: %0.2f%%\n", 100.0 * l2_delta_log.mshr_stalls / num_partitions / l2_config.num_slices / l2_config.num_banks / delta); - printf("L1d$ Stalls: %0.2f%%\n", 100.0 * l1d_delta_log.mshr_stalls / num_tms / l1d_config.num_banks / delta); - printf(" \n"); - printf("L2$ Occ: %0.2f%%\n", 100.0 * l2_delta_log.get_total() / num_partitions / l2_config.num_slices / l2_config.num_banks / delta); - printf("L1d$ Occ: %0.2f%%\n", 100.0 * l1d_delta_log.get_total() / num_tms / l1d_config.num_banks / delta); - printf(" \n"); - if(!rtcs.empty()) - { - printf("MRays/s: %.0f\n\n", rtc_delta_log.rays / delta_ns * 1000.0); - rtc_delta_log.print(rtcs.size()); - } - }); - - auto stop = std::chrono::high_resolution_clock::now(); - - for(uint addr = 0; addr < heap_address; addr += partition_stride) - drams[xbar.get_partition(addr)]->direct_read(device_mem + addr, partition_stride, xbar.strip_partition_bits(addr)); - - if(serialize_l2) - for(uint i = 0; i < num_partitions; ++i) - l2s[i]->serialize("l2-p" + std::to_string(i) + ".bin"); - - cycles_t frame_cycles = simulator.current_cycle; - double frame_time = frame_cycles / core_clock; - double frame_time_ns = frame_time * 1e9; - double simulation_time = std::chrono::duration_cast(stop - start).count() / 1000.0; - - tp_log.print_profile(device_mem); - - float total_power = 0.0f; - for(auto& dram : drams) - { - dram->print_stats(4, frame_cycles); - total_power += dram->total_power(); - } - print_header("DRAM"); - delta_log(dram_log, drams); - printf("DRAM Read: %.1f GB/s (%.2f%%)\n", (float)dram_log.bytes_read / frame_time_ns, 100.0f * dram_log.bytes_read / frame_cycles / peak_dram_bandwidth); - dram_log.print(frame_cycles); - - print_header("L2$"); - delta_log(l2_log, l2s); - printf(" L2$ Read: %.1f B/clk (%.2f%%)\n", (float)l2_log.bytes_read / frame_cycles, 100.0f * l2_log.bytes_read / frame_cycles / peak_l2_bandwidth); - l2_log.print(frame_cycles); - total_power += l2_log.print_power(l2_power_config, frame_time); - - print_header("L1d$"); - delta_log(l1d_log, l1ds); - printf("L1d$ Read: %.1f B/clk (%.2f%%)\n", (float)l1d_log.bytes_read / frame_cycles, 100.0 * l1d_log.bytes_read / frame_cycles / peak_l1d_bandwidth); - l1d_log.print(frame_cycles); - total_power += l1d_log.print_power(l1d_power_config, frame_time); - - print_header("TP"); - delta_log(tp_log, tps); - tp_log.print(tps.size()); - - if(!rtcs.empty()) - { - print_header("RT Core"); - delta_log(rtc_log, rtcs); - rtc_log.print(rtcs.size()); - } - - float total_energy = total_power * frame_time; - - print_header("Performance Summary"); - printf("Cycles: %lld\n", simulator.current_cycle); - printf("Clock rate: %.0f MHz\n", core_clock / 1'000'000.0); - printf("Frame time: %.3g ms\n", frame_time * 1000.0); - if(!rtcs.empty()) printf("MRays/s: %.0f\n", rtc_log.rays / frame_time / 1'000'000.0); - else printf("MRays/s: %.0f\n", kernel_args.framebuffer_size / frame_time / 1'000'000.0); - - print_header("Power Summary"); - printf("Energy: %.2f mJ\n", total_power * frame_time * 1000.0); - printf("Power: %.2f W\n", total_power); - if(!rtcs.empty()) printf("MRays/J: %.2f\n", rtc_log.rays / total_energy / 1'000'000.0); - else printf("MRays/J: %.2f\n", kernel_args.framebuffer_size / total_energy / 1'000'000.0); - - print_header("Simulation Summary"); - printf("Simulation rate: %.2f KHz\n", simulator.current_cycle / simulation_time / 1000.0); - printf("Simulation time: %.0f s\n", simulation_time); - printf("MSIPS: %.2f\n", simulator.current_cycle * tps.size() / simulation_time / 1'000'000.0); - - stbi_flip_vertically_on_write(true); - stbi_write_png("out.png", (int)kernel_args.framebuffer_width, (int)kernel_args.framebuffer_height, 4, device_mem + (size_t)kernel_args.framebuffer, 0); - free(device_mem); - - for(auto& tp : tps) delete tp; - for(auto& sfu : sfus) delete sfu; - for(auto& l1d : l1ds) delete l1d; - for(auto& thread_scheduler : thread_schedulers) delete thread_scheduler; - for(auto& rtc : rtcs) delete rtc; - for(auto& l2 : l2s) delete l2; - for(auto& dram : drams) delete dram; -} -} -} \ No newline at end of file diff --git a/src/arches-v2/units/trax/unit-tp.hpp b/src/arches-v2/units/trax/unit-tp.hpp index e2c9ffc..851bc80 100644 --- a/src/arches-v2/units/trax/unit-tp.hpp +++ b/src/arches-v2/units/trax/unit-tp.hpp @@ -31,6 +31,16 @@ class UnitTP : public Arches::Units::UnitTP if(float_regs_pending[i]) return float_regs_pending[i]; } + else if(instr_info.instr_type == ISA::RISCV::InstrType::CUSTOM3) //SAMPLE2D + { + for(uint i = 0; i < 3; ++i) + if(float_regs_pending[instr.rs1 + i]) + return float_regs_pending[instr.rs1 + i]; + + for(uint i = 0; i < 4; ++i) + if(float_regs_pending[instr.rd + i]) + return float_regs_pending[instr.rd + i]; + } else if(instr_info.instr_type == ISA::RISCV::InstrType::CUSTOM7) //TRACE RAY { for(uint i = 0; i < sizeof(rtm::Ray) / sizeof(float); ++i) diff --git a/src/arches-v2/units/unit-sfu.hpp b/src/arches-v2/units/unit-sfu.hpp index d6fb97d..04c0f7d 100644 --- a/src/arches-v2/units/unit-sfu.hpp +++ b/src/arches-v2/units/unit-sfu.hpp @@ -54,7 +54,7 @@ class UnitSFU : public UnitBase return return_crossbar.peek(port_index); } - virtual const SFURequest& read_return(uint port_index) + virtual const SFURequest read_return(uint port_index) { return return_crossbar.read(port_index); } diff --git a/src/arches-v2/units/unit-texture.cpp b/src/arches-v2/units/unit-texture.cpp new file mode 100644 index 0000000..b79a188 --- /dev/null +++ b/src/arches-v2/units/unit-texture.cpp @@ -0,0 +1,79 @@ +#include "unit-texture.hpp" + +namespace Arches { namespace Units { + +void UnitTexture::clock_rise() +{ + _request_network.clock(); + + if(_filter_pipline.is_write_valid() && cache->return_port_read_valid(cache_port)) + { + MemoryReturn ret = cache->read_return(cache_port); + Sample& sample = _pending_samples[ret.dst.raw]; + _assert(sample.texel_addrs[0] == ret.paddr); + std::memcpy(&sample.texels[0], ret.data, sizeof(Texture2D::Texel)); + sample.pending_texels--; + + if(sample.pending_texels == 0) + _filter_pipline.write(sample); + } + + if(_request_network.is_read_valid(0)) + { + MemoryRequest req = _request_network.read(0); + req.dst.push(req.port, 8); + + Sample& sample = _pending_samples[req.dst.raw]; + sample.req = req; + + //nearest + sample.pending_texels = 1; + + rtm::vec2 uv; + std::memcpy(&uv, req.data, sizeof(rtm::vec2)); + Texture2D& texture = *(Texture2D*)&_cheat_mem[req.vaddr]; + + rtm::uvec2 iuv = texture.get_iuv(uv); + Texture2D::Texel* texel = texture.get_texel_addr(iuv); + sample.texel_addrs[0] = (paddr_t)texel; + + MemoryRequest texel_req; + texel_req.type = MemoryRequest::Type::LOAD; + texel_req.paddr = sample.texel_addrs[0]; + texel_req.size = sizeof(Texture2D::Texel); + texel_req.port = cache_port; + texel_req.dst = req.dst; + _texel_fill_queue.push(texel_req); + } + +} + +void UnitTexture::clock_fall() +{ + _filter_pipline.clock(); + + if(_filter_pipline.is_read_valid() && _return_network.is_write_valid(0)) + { + Sample sample = _filter_pipline.read(); + + MemoryReturn ret2(sample.req); + ret2.size = sizeof(rtm::vec4); + ret2.dst.pop(8); + + rtm::vec4 color = Texture2D::decode_texel(sample.texels[0]); + std::memcpy(ret2.data, &color, sizeof(rtm::vec4)); + + _return_network.write(ret2, 0); + } + + if(_texel_fill_queue.size() && cache->request_port_write_valid(cache_port)) + { + cache->write_request(_texel_fill_queue.front()); + _texel_fill_queue.pop(); + } + + _return_network.clock(); +} + +} +} \ No newline at end of file diff --git a/src/arches-v2/units/unit-texture.hpp b/src/arches-v2/units/unit-texture.hpp new file mode 100644 index 0000000..fee46b7 --- /dev/null +++ b/src/arches-v2/units/unit-texture.hpp @@ -0,0 +1,79 @@ +#pragma once +#include "stdafx.hpp" + +#include "unit-memory-base.hpp" + +#include "util/arbitration.hpp" + +namespace Arches { +namespace Units { + +class UnitTexture : public UnitMemoryBase +{ +public: + struct Configuration + { + uint num_clients{1}; + uint8_t* cheat_mem{nullptr}; + UnitMemoryBase* cache{nullptr}; + uint cache_port{0}; + }; + +private: + UnitMemoryBase* cache; + uint cache_port; + + Cascade> _request_network; + ReturnCascade _return_network; + + uint8_t* _cheat_mem; + + struct Sample + { + paddr_t texel_addrs[8]; + Texture2D::Texel texels[8]; + uint32_t pending_texels; + MemoryRequest req; + }; + + std::unordered_map _pending_samples; + std::queue _texel_fill_queue; + LatencyFIFO _filter_pipline; + +public: + UnitTexture(const Configuration& config) : UnitMemoryBase(), cache(config.cache), cache_port(config.cache_port), _cheat_mem(config.cheat_mem), + _filter_pipline(10), _request_network(config.num_clients, 1), _return_network(1, config.num_clients) + { + } + + void clock_rise() override; + void clock_fall() override; + + bool request_port_write_valid(uint port_index) override + { + return _request_network.is_write_valid(port_index); + } + + void write_request(const MemoryRequest& request) override + { + _request_network.write(request, request.port); + } + + bool return_port_read_valid(uint port_index) override + { + return _return_network.is_read_valid(port_index); + } + + const MemoryReturn& peek_return(uint port_index) override + { + return _return_network.peek(port_index); + } + + const MemoryReturn read_return(uint port_index) override + { + return _return_network.read(port_index); + } +}; + +} +} \ No newline at end of file diff --git a/src/trax-kernel/custom-instr.hpp b/src/trax-kernel/custom-instr.hpp index 455ad8e..87ab80c 100644 --- a/src/trax-kernel/custom-instr.hpp +++ b/src/trax-kernel/custom-instr.hpp @@ -5,11 +5,22 @@ static std::atomic_uint _next_thread; #endif +// uint32_t inline fchthrd() +// { +// #ifdef __riscv +// uint32_t value = 0; +// asm volatile("fchthrd %0\n\t" : "=r" (value)); +// return value; +// #else +// return _next_thread++; +// #endif +// } + uint32_t inline fchthrd() { #ifdef __riscv uint32_t value = 0; - asm volatile("fchthrd %0\n\t" : "=r" (value)); + asm volatile(".insn i 0x000b, 0, %0, x0, 0\n\t" : "=r" (value)); return value; #else return _next_thread++; @@ -31,4 +42,24 @@ inline void ebreak() "ebreak\n\t" ); #endif +} + +rtm::vec4 inline sample2d(Texture2D* texture, rtm::vec2 uv) +{ +#ifdef __riscv + uint32_t addr = (uint64_t)texture; + register float src0 asm("f0") = *(float*)&addr; + register float src1 asm("f1") = uv.x; + register float src2 asm("f2") = uv.y; + + register float dst0 asm("f28"); + register float dst1 asm("f29"); + register float dst2 asm("f30"); + register float dst3 asm("f31"); + + asm volatile(".insn i 0xb, 0x6, %0, %4, 0\n\t" : "=f" (dst0), "=f" (dst1), "=f" (dst2), "=f" (dst3) : "f" (src0), "f" (src1), "f" (src2)); + return rtm::vec4(dst0, dst1, dst2, dst3); +#else + return texture->sample(uv); +#endif } \ No newline at end of file diff --git a/src/trax-kernel/main.cpp b/src/trax-kernel/main.cpp index 4b00f8b..12b0ec0 100644 --- a/src/trax-kernel/main.cpp +++ b/src/trax-kernel/main.cpp @@ -114,18 +114,18 @@ inline static void kernel(const TRaXKernelArgs& args) rtm::uvec3 tci = args.tex_coord_indices[hit.id]; rtm::vec2 tc = args.tex_coords[tci[0]] * hit.bc[0] + args.tex_coords[tci[1]] * hit.bc[1] + args.tex_coords[tci[2]] * (1.0f - hit.bc[0] - hit.bc[1]); - rtm::vec3 albedo; + rtm::vec4 albedo; if(mat.use_am) { - albedo = mat.albedo_texture.sample(tc); + albedo = sample2d(&mat.albedo_texture, tc); } else { - albedo = rtm::vec3(1.0f, 0.0f, 1.0f); + albedo = rtm::vec4(1.0f, 0.0f, 1.0f, 1.0f); } //args.framebuffer[fb_index] = rtm::RNG::hash(mat.use_am) | 0xff000000; - args.framebuffer[fb_index] = encode_pixel(albedo); + args.framebuffer[fb_index] = encode_pixel(rtm::vec3(albedo.x, albedo.y, albedo.z)); } else { diff --git a/src/trax-kernel/riscv/kernel b/src/trax-kernel/riscv/kernel index d84f51d6fb0b36b54f5d4550793974703b3f6c9d..3a490075f011b18d04f54c0432c3b3e9d9acb8ac 100644 GIT binary patch delta 992 zcmZ{jO=uHA6vyA}>dv;+G|8m9ZBpW`9UrFkXl?MHt!))JA{Qi*s1__=oqWzA$Mb0GzJd<8H=pxV&yp4 z3Wjtt3C5t~$rYXWEz=T;HCF%%k7hJ;*9XlE&S-@PbDCNEMKfJH7+mId;P`G~asXC#8JAL*X zqnJ5gk`yfy0aIXJPd3Sqqm+-^q3W_GM7N#qL!S9ISxX92T5HX$kD+tj`=?0K2Kn zi<_V>SHj(PC`)jttIk%s2v#g&44d8gq8vG1#7e%TyIGzOhb$kfY4I6DhFD0{hjK$E zd(EuZ7;e7Fio7@kq=3ZGK=JVkb+$Ju3j59KnVCc5!*#pd{5w?bf+IEDRaYh? z{H#qmJE>oC(KhKSeJVBjEgrGVO5=ftDMCL>*H2CbD#IsOrh6@1$$K`qLn`^DElwZG z!NY59F?8)eG9z2*l`{9Yc;=UU25}P{J=w6=v)R1GvpBN|z1OhMGqusf)W+b|)AtQ} z)#ey_Qx^ktx4&_9K+gwhuV3L5AHCy0R5e+*t}>dsDARerQng=N2d7s-uV4HNQfB%1 delta 1630 zcmZ`)U1%Fe5T3pD-Kq|pPd@9U#z8(iwIxz;Q|wp<6(hLBF1U3amL{bSZ7es*V`Mvu z{bVVq(WMkj-6KkJejllfApfIS-uW0kf`Wt&Ss8g>te}$%I-M1{FK*WdO2Iq*z zC1>jr5zH&KfP8j2X65=IFEPm6)WE8A!uXsR&RowzDnej;{Ja>(`79cr^F=M6 z2AKhPxhH!QRBkXocNg&H2;jc-Zd%zyYH$F05)pv3=INJwJGMJ2t{`-g-E*u_}dZ~Nng2^0h%K= zxgy4CqRC+i$uTCrKb-j{3(WOjc@r5Ehjn%V>`?bvpqkHSZc22R&ud4eRm9qmT{w3iv2vM5 z(&XIW5t6T(#_7ak55rK7&Tiu9nN?h<+{cCNDvruO;=)`B)GuYh`rvnzml?wMx*>Bf z2${Pw&}5rKt8Gq{N6of^jun&yb#)W2}NatE~HbLgz*u&U+^ zofbFlOv^LFNwGXMD_4iBp7gD%Cu~#}V@0?8GkIkN70d96kzCp+PCu?132&JCl3{jn zI!u*Bn2HC&%#4I7><=5s<9g9OsT+x7I^(IQIls+(O3GfyWuaA*P=37x`SoS!d9)6# zguvd#N3e?kWI}1B_CG%9HfBPgEF3us3-5-Ym4zU>dP1-&Nl>RFP4-I(?cgiAR}-Y&R!u$TBYI3F zooUsWG7>{&5+15t4cu}=9?}6Vrk+lsbeW6}W94r+0djuJiHY)q*$EOT8qb-S6N zl76cEVFeHF1w5icW)RnJ3$k=BPLrBL6a3`3TqZjen$Q&nP-bpOsof;zQYlUqg%`%h z3*!&h`lN>Qz8g0er8tFw{r^?UNa!9T+36|r-7!tE>rHaO|Gz~#RsQLrHG}u;3-c>2 z!1>R=x3|j;d>|Z@bsR(25I%@Da0_ab;ovUpL2`Zt|J5|_?6SU7<%YBAbkw@7o}PCa ztT$Tr?0z63zV>;-fFYn+lPzabf#CD%H~CDZu6`bWZwK$^aXw2Q?clvH;0ca51A-0C zHL6FzWs>tV_Ft{(>hv-H))X%q`g8~H*YcnHMvzc$uzuCLz5j;wfFGb5#jJM!Yu>=l xE&@9b!X9hP9}#*r>kIz?e=e*axHzv_y8{D)Ct$rF7 100f0: 0007071b sext.w a4,a4 - 100f4: 4cf77e63 bgeu a4,a5,105d0 - 100f8: 00100793 li a5,1 + 100f4: 28f77263 bgeu a4,a5,10378 + 100f8: 00011837 lui a6,0x11 100fc: 00011537 lui a0,0x11 10100: 000115b7 lui a1,0x11 10104: 00011637 lui a2,0x11 10108: 000116b7 lui a3,0x11 - 1010c: 02079893 sll a7,a5,0x20 - 10110: 62052687 flw fa3,1568(a0) # 11620 <__DATA_BEGIN__+0x8> - 10114: 6245a707 flw fa4,1572(a1) # 11624 <__DATA_BEGIN__+0xc> - 10118: 61862587 flw fa1,1560(a2) # 11618 <__DATA_BEGIN__> - 1011c: 61c6a607 flw fa2,1564(a3) # 1161c <__DATA_BEGIN__+0x4> - 10120: 11003803 ld a6,272(zero) # 110 - 10124: 11804503 lbu a0,280(zero) # 118 - 10128: fe010113 add sp,sp,-32 - 1012c: 00f888b3 add a7,a7,a5 - 10130: 2ec0006f j 1041c - 10134: 00c16603 lwu a2,12(sp) - 10138: 1b003703 ld a4,432(zero) # 1b0 - 1013c: 1b803303 ld t1,440(zero) # 1b8 - 10140: 00261693 sll a3,a2,0x2 - 10144: 00d70733 add a4,a4,a3 - 10148: 00076583 lwu a1,0(a4) - 1014c: ffff0737 lui a4,0xffff0 - 10150: 0ff70713 add a4,a4,255 # ffffffffffff00ff <__global_pointer$+0xfffffffffffde2e7> - 10154: 00359693 sll a3,a1,0x3 - 10158: 40b686b3 sub a3,a3,a1 - 1015c: 00369693 sll a3,a3,0x3 - 10160: 00d306b3 add a3,t1,a3 - 10164: 0016c583 lbu a1,1(a3) - 10168: 0025f593 and a1,a1,2 - 1016c: 28058e63 beqz a1,10408 - 10170: 0086b583 ld a1,8(a3) - 10174: ff000737 lui a4,0xff000 - 10178: 28058863 beqz a1,10408 - 1017c: 0186b303 ld t1,24(a3) - 10180: 3b158e63 beq a1,a7,1053c - 10184: 00161593 sll a1,a2,0x1 - 10188: 19003e83 ld t4,400(zero) # 190 - 1018c: 00c58633 add a2,a1,a2 - 10190: 00261613 sll a2,a2,0x2 - 10194: 00ce8eb3 add t4,t4,a2 - 10198: 004eee03 lwu t3,4(t4) - 1019c: 1a803603 ld a2,424(zero) # 1a8 - 101a0: 01eef553 fadd.s fa0,ft9,ft10 - 101a4: 003e1e13 sll t3,t3,0x3 - 101a8: 01c60e33 add t3,a2,t3 - 101ac: 00011f37 lui t5,0x11 - 101b0: 62cf2187 flw ft3,1580(t5) # 1162c <__DATA_BEGIN__+0x14> - 101b4: 004e2787 flw fa5,4(t3) - 101b8: 008ee583 lwu a1,8(t4) - 101bc: 08a1f553 fsub.s fa0,ft3,fa0 - 101c0: 11e7f7d3 fmul.s fa5,fa5,ft10 - 101c4: 00359593 sll a1,a1,0x3 - 101c8: 00b605b3 add a1,a2,a1 - 101cc: 000eee83 lwu t4,0(t4) - 101d0: 0045a007 flw ft0,4(a1) - 101d4: 000e2087 flw ft1,0(t3) - 101d8: 003e9e13 sll t3,t4,0x3 - 101dc: 78057043 fmadd.s ft0,fa0,ft0,fa5 - 101e0: 01c60633 add a2,a2,t3 - 101e4: 00462787 flw fa5,4(a2) - 101e8: 101f7f53 fmul.s ft10,ft10,ft1 - 101ec: 0005a087 flw ft1,0(a1) - 101f0: 01d7f7c3 fmadd.s fa5,fa5,ft9,ft0 - 101f4: 00062107 flw ft2,0(a2) - 101f8: 00c6af03 lw t5,12(a3) - 101fc: f0157543 fmadd.s fa0,fa0,ft1,ft10 - 10200: 000115b7 lui a1,0x11 - 10204: c0079653 fcvt.w.s a2,fa5,rtz - 10208: 6205a207 flw ft4,1568(a1) # 11620 <__DATA_BEGIN__+0x8> - 1020c: d00f7053 fcvt.s.w ft0,t5 - 10210: d00670d3 fcvt.s.w ft1,a2 - 10214: 51d17143 fmadd.s ft2,ft2,ft9,fa0 - 10218: 0086ae83 lw t4,8(a3) - 1021c: 0817f7d3 fsub.s fa5,fa5,ft1 - 10220: 00011637 lui a2,0x11 - 10224: 62462507 flw fa0,1572(a2) # 11624 <__DATA_BEGIN__+0xc> - 10228: c00116d3 fcvt.w.s a3,ft2,rtz - 1022c: d00ef2d3 fcvt.s.w ft5,t4 - 10230: 2007f0c3 fmadd.s ft1,fa5,ft0,ft4 - 10234: d006f353 fcvt.s.w ft6,a3 - 10238: 5007f7c3 fmadd.s fa5,fa5,ft0,fa0 - 1023c: 00011637 lui a2,0x11 - 10240: 08617153 fsub.s ft2,ft2,ft6 - 10244: c01096d3 fcvt.wu.s a3,ft1,rtz - 10248: 62862007 flw ft0,1576(a2) # 11628 <__DATA_BEGIN__+0x10> - 1024c: c0179e53 fcvt.wu.s t3,fa5,rtz - 10250: 01e686bb addw a3,a3,t5 - 10254: 03e6f6bb remuw a3,a3,t5 - 10258: 50517543 fmadd.s fa0,ft2,ft5,fa0 - 1025c: 01ee0e3b addw t3,t3,t5 - 10260: 20517143 fmadd.s ft2,ft2,ft5,ft4 - 10264: c00095d3 fcvt.w.s a1,ft1,rtz - 10268: 00812e27 fsw fs0,28(sp) - 1026c: c0151653 fcvt.wu.s a2,fa0,rtz - 10270: d005f7d3 fcvt.s.w fa5,a1 - 10274: c01115d3 fcvt.wu.s a1,ft2,rtz - 10278: 01d6063b addw a2,a2,t4 - 1027c: c0011fd3 fcvt.w.s t6,ft2,rtz - 10280: 01d585bb addw a1,a1,t4 - 10284: 08f0f0d3 fsub.s ft1,ft1,fa5 - 10288: d00ff7d3 fcvt.s.w fa5,t6 - 1028c: 00011fb7 lui t6,0x11 - 10290: 630fa307 flw ft6,1584(t6) # 11630 <__DATA_BEGIN__+0x18> - 10294: 08f17153 fsub.s ft2,ft2,fa5 - 10298: f00003d3 fmv.w.x ft7,zero - 1029c: 03ee7e3b remuw t3,t3,t5 - 102a0: 03d6763b remuw a2,a2,t4 - 102a4: 03d5f5bb remuw a1,a1,t4 - 102a8: 03d686bb mulw a3,a3,t4 - 102ac: 03de0e3b mulw t3,t3,t4 - 102b0: 00c68ebb addw t4,a3,a2 - 102b4: 020e9f13 sll t5,t4,0x20 - 102b8: 01ef5e93 srl t4,t5,0x1e - 102bc: 01d30eb3 add t4,t1,t4 - 102c0: 002ecf03 lbu t5,2(t4) - 102c4: 001ecf83 lbu t6,1(t4) - 102c8: 000ece83 lbu t4,0(t4) - 102cc: d01f7553 fcvt.s.wu fa0,t5 - 102d0: d01ff853 fcvt.s.wu fa6,t6 - 102d4: 01c6063b addw a2,a2,t3 - 102d8: d01ef2d3 fcvt.s.wu ft5,t4 - 102dc: 02061e93 sll t4,a2,0x20 - 102e0: 01eed613 srl a2,t4,0x1e - 102e4: 00c30633 add a2,t1,a2 - 102e8: 00264e83 lbu t4,2(a2) - 102ec: 00164f03 lbu t5,1(a2) - 102f0: 10087853 fmul.s fa6,fa6,ft0 - 102f4: 10057553 fmul.s fa0,fa0,ft0 - 102f8: 00064603 lbu a2,0(a2) - 102fc: d01effd3 fcvt.s.wu ft11,t4 - 10300: 1002f2d3 fmul.s ft5,ft5,ft0 - 10304: d01f7ed3 fcvt.s.wu ft9,t5 - 10308: d01678d3 fcvt.s.wu fa7,a2 - 1030c: 500ff54b fnmsub.s fa0,ft11,ft0,fa0 - 10310: 800ef84b fnmsub.s fa6,ft9,ft0,fa6 - 10314: 00b686bb addw a3,a3,a1 - 10318: 02069613 sll a2,a3,0x20 - 1031c: 2808f2cb fnmsub.s ft5,fa7,ft0,ft5 - 10320: 01e65693 srl a3,a2,0x1e - 10324: 00d306b3 add a3,t1,a3 - 10328: 0026c603 lbu a2,2(a3) - 1032c: 0016ce83 lbu t4,1(a3) - 10330: 1100f853 fmul.s fa6,ft1,fa6 - 10334: 10a0f553 fmul.s fa0,ft1,fa0 - 10338: 01c585bb addw a1,a1,t3 - 1033c: 0006ce03 lbu t3,0(a3) - 10340: 02059693 sll a3,a1,0x20 - 10344: 1050f2d3 fmul.s ft5,ft1,ft5 - 10348: d01677d3 fcvt.s.wu fa5,a2 - 1034c: 01e6d593 srl a1,a3,0x1e - 10350: d01eff53 fcvt.s.wu ft10,t4 - 10354: 00b306b3 add a3,t1,a1 - 10358: 0016c583 lbu a1,1(a3) - 1035c: 0026c603 lbu a2,2(a3) - 10360: d01e7e53 fcvt.s.wu ft8,t3 - 10364: 800efec3 fmadd.s ft9,ft9,ft0,fa6 - 10368: 500fffc3 fmadd.s ft11,ft11,ft0,fa0 - 1036c: 100f7f53 fmul.s ft10,ft10,ft0 - 10370: 1007f7d3 fmul.s fa5,fa5,ft0 - 10374: 0006c683 lbu a3,0(a3) - 10378: 100e7853 fmul.s fa6,ft8,ft0 - 1037c: 2808f2c3 fmadd.s ft5,fa7,ft0,ft5 - 10380: d015f553 fcvt.s.wu fa0,a1 - 10384: d0167e53 fcvt.s.wu ft8,a2 - 10388: d016f8d3 fcvt.s.wu fa7,a3 - 1038c: e8057447 fmsub.s fs0,fa0,ft0,ft9 - 10390: 780e77cb fnmsub.s fa5,ft8,ft0,fa5 - 10394: f005754b fnmsub.s fa0,fa0,ft0,ft10 - 10398: f80e7e47 fmsub.s ft8,ft8,ft0,ft11 - 1039c: 8008f84b fnmsub.s fa6,fa7,ft0,fa6 - 103a0: 2808f8c7 fmsub.s fa7,fa7,ft0,ft5 - 103a4: 40a0f543 fmadd.s fa0,ft1,fa0,fs0 - 103a8: e0f0f7c3 fmadd.s fa5,ft1,fa5,ft8 - 103ac: 01c12407 flw fs0,28(sp) - 103b0: 8900f0c3 fmadd.s ft1,ft1,fa6,fa7 - 103b4: e8a17543 fmadd.s fa0,ft2,fa0,ft9 - 103b8: f8f177c3 fmadd.s fa5,ft2,fa5,ft11 - 103bc: 28117043 fmadd.s ft0,ft2,ft1,ft5 - 103c0: 28751553 fmax.s fa0,fa0,ft7 - 103c4: 287797d3 fmax.s fa5,fa5,ft7 - 103c8: 28701053 fmax.s ft0,ft0,ft7 - 103cc: 28350553 fmin.s fa0,fa0,ft3 - 103d0: 283787d3 fmin.s fa5,fa5,ft3 - 103d4: 28300053 fmin.s ft0,ft0,ft3 - 103d8: 20657543 fmadd.s fa0,fa0,ft6,ft4 - 103dc: 2067f7c3 fmadd.s fa5,fa5,ft6,ft4 - 103e0: 20607043 fmadd.s ft0,ft0,ft6,ft4 - 103e4: c01516d3 fcvt.wu.s a3,fa0,rtz - 103e8: c0179653 fcvt.wu.s a2,fa5,rtz - 103ec: c01015d3 fcvt.wu.s a1,ft0,rtz - 103f0: 0086969b sllw a3,a3,0x8 - 103f4: 0106161b sllw a2,a2,0x10 - 103f8: 00c6e6b3 or a3,a3,a2 - 103fc: 00b6e6b3 or a3,a3,a1 - 10400: 00e6e733 or a4,a3,a4 - 10404: 0007071b sext.w a4,a4 - 10408: 00e7a023 sw a4,0(a5) - 1040c: 0000070b fchthrd a4 - 10410: 10802783 lw a5,264(zero) # 108 - 10414: 0007071b sext.w a4,a4 - 10418: 0af77263 bgeu a4,a5,104bc - 1041c: 10002783 lw a5,256(zero) # 100 - 10420: 0057569b srlw a3,a4,0x5 - 10424: 0027559b srlw a1,a4,0x2 - 10428: 0027d31b srlw t1,a5,0x2 - 1042c: 0266d63b divuw a2,a3,t1 - 10430: 0075f593 and a1,a1,7 - 10434: 00377713 and a4,a4,3 - 10438: 0266f6bb remuw a3,a3,t1 - 1043c: 0036161b sllw a2,a2,0x3 - 10440: 00b6063b addw a2,a2,a1 - 10444: 02c787bb mulw a5,a5,a2 - 10448: 0026969b sllw a3,a3,0x2 - 1044c: 00e686bb addw a3,a3,a4 - 10450: 00d787bb addw a5,a5,a3 - 10454: 02079793 sll a5,a5,0x20 - 10458: 0207d793 srl a5,a5,0x20 - 1045c: 06050663 beqz a0,104c8 - 10460: 16803703 ld a4,360(zero) # 168 - 10464: 00579693 sll a3,a5,0x5 - 10468: 00d70733 add a4,a4,a3 - 1046c: 00072007 flw ft0,0(a4) # ffffffffff000000 <__global_pointer$+0xfffffffffefee1e8> - 10470: 00472087 flw ft1,4(a4) - 10474: 00872107 flw ft2,8(a4) - 10478: 00c72187 flw ft3,12(a4) - 1047c: 01072207 flw ft4,16(a4) - 10480: 01472287 flw ft5,20(a4) - 10484: 01872307 flw ft6,24(a4) - 10488: 01c72387 flw ft7,28(a4) - 1048c: 00005e0b traceray ft8,ft0,0 - 10490: a1c38753 fle.s a4,ft7,ft8 - 10494: 00279793 sll a5,a5,0x2 - 10498: 01f12627 fsw ft11,12(sp) - 1049c: 00f807b3 add a5,a6,a5 - 104a0: c8070ae3 beqz a4,10134 - 104a4: ff000737 lui a4,0xff000 - 104a8: 00e7a023 sw a4,0(a5) - 104ac: 0000070b fchthrd a4 - 104b0: 10802783 lw a5,264(zero) # 108 - 104b4: 0007071b sext.w a4,a4 - 104b8: f6f762e3 bltu a4,a5,1041c - 104bc: 00000513 li a0,0 - 104c0: 02010113 add sp,sp,32 - 104c4: 00008067 ret - 104c8: d006f7d3 fcvt.s.w fa5,a3 - 104cc: 12002507 flw fa0,288(zero) # 120 - 104d0: d0067353 fcvt.s.w ft6,a2 - 104d4: 00d7f7d3 fadd.s fa5,fa5,fa3 - 104d8: 14c02287 flw ft5,332(zero) # 14c - 104dc: 00d37353 fadd.s ft6,ft6,fa3 - 104e0: 12402207 flw ft4,292(zero) # 124 - 104e4: 15002187 flw ft3,336(zero) # 150 - 104e8: 70a7f7c3 fmadd.s fa5,fa5,fa0,fa4 - 104ec: 13c02107 flw ft2,316(zero) # 13c - 104f0: 15402087 flw ft1,340(zero) # 154 - 104f4: 13402007 flw ft0,308(zero) # 134 - 104f8: 13802507 flw fa0,312(zero) # 138 - 104fc: 70437343 fmadd.s ft6,ft6,ft4,fa4 - 10500: 2807f047 fmsub.s ft0,fa5,ft0,ft5 - 10504: 18a7f547 fmsub.s fa0,fa5,fa0,ft3 - 10508: 0827f7c7 fmsub.s fa5,fa5,ft2,ft1 - 1050c: 14002207 flw ft4,320(zero) # 140 - 10510: 14802087 flw ft1,328(zero) # 148 - 10514: 14402287 flw ft5,324(zero) # 144 - 10518: 00437243 fmadd.s ft4,ft6,ft4,ft0 - 1051c: 13002107 flw ft2,304(zero) # 130 - 10520: 505372c3 fmadd.s ft5,ft6,ft5,fa0 - 10524: 12802007 flw ft0,296(zero) # 128 - 10528: 78137343 fmadd.s ft6,ft6,ft1,fa5 - 1052c: 20b583d3 fmv.s ft7,fa1 - 10530: 12c02087 flw ft1,300(zero) # 12c - 10534: 20c601d3 fmv.s ft3,fa2 - 10538: f55ff06f j 1048c - 1053c: 00234603 lbu a2,2(t1) - 10540: 00134e03 lbu t3,1(t1) - 10544: 00034683 lbu a3,0(t1) - 10548: 000115b7 lui a1,0x11 - 1054c: 6285a107 flw ft2,1576(a1) # 11628 <__DATA_BEGIN__+0x10> - 10550: d0167553 fcvt.s.wu fa0,a2 - 10554: d01e7053 fcvt.s.wu ft0,t3 - 10558: d016f7d3 fcvt.s.wu fa5,a3 - 1055c: 10257553 fmul.s fa0,fa0,ft2 - 10560: 10207053 fmul.s ft0,ft0,ft2 - 10564: f00000d3 fmv.w.x ft1,zero - 10568: 1027f7d3 fmul.s fa5,fa5,ft2 - 1056c: 000116b7 lui a3,0x11 - 10570: 62c6a187 flw ft3,1580(a3) # 1162c <__DATA_BEGIN__+0x14> - 10574: 28101053 fmax.s ft0,ft0,ft1 - 10578: 28151553 fmax.s fa0,fa0,ft1 - 1057c: 281797d3 fmax.s fa5,fa5,ft1 - 10580: 00011637 lui a2,0x11 - 10584: 000116b7 lui a3,0x11 - 10588: 63062107 flw ft2,1584(a2) # 11630 <__DATA_BEGIN__+0x18> - 1058c: 6206a087 flw ft1,1568(a3) # 11620 <__DATA_BEGIN__+0x8> - 10590: 28300053 fmin.s ft0,ft0,ft3 - 10594: 28350553 fmin.s fa0,fa0,ft3 - 10598: 283787d3 fmin.s fa5,fa5,ft3 - 1059c: 08207043 fmadd.s ft0,ft0,ft2,ft1 - 105a0: 08257543 fmadd.s fa0,fa0,ft2,ft1 - 105a4: 0827f7c3 fmadd.s fa5,fa5,ft2,ft1 - 105a8: c01016d3 fcvt.wu.s a3,ft0,rtz - 105ac: c0151653 fcvt.wu.s a2,fa0,rtz - 105b0: c01795d3 fcvt.wu.s a1,fa5,rtz - 105b4: 0086969b sllw a3,a3,0x8 - 105b8: 0106161b sllw a2,a2,0x10 - 105bc: 00c6e6b3 or a3,a3,a2 - 105c0: 00b6e6b3 or a3,a3,a1 - 105c4: 00e6e733 or a4,a3,a4 - 105c8: 0007071b sext.w a4,a4 - 105cc: e3dff06f j 10408 - 105d0: 00000513 li a0,0 - 105d4: 00008067 ret + 1010c: 000117b7 lui a5,0x11 + 10110: 3c082707 flw fa4,960(a6) # 113c0 <__DATA_BEGIN__+0x8> + 10114: 3c452507 flw fa0,964(a0) # 113c4 <__DATA_BEGIN__+0xc> + 10118: 3b85a887 flw fa7,952(a1) # 113b8 <__DATA_BEGIN__> + 1011c: 3bc62807 flw fa6,956(a2) # 113bc <__DATA_BEGIN__+0x4> + 10120: 3c86a687 flw fa3,968(a3) # 113c8 <__DATA_BEGIN__+0x10> + 10124: 3cc7a587 flw fa1,972(a5) # 113cc <__DATA_BEGIN__+0x14> + 10128: 11003503 ld a0,272(zero) # 110 + 1012c: 11804583 lbu a1,280(zero) # 118 + 10130: f0000653 fmv.w.x fa2,zero + 10134: ff010113 add sp,sp,-16 + 10138: ff000837 lui a6,0xff000 + 1013c: 1200006f j 1025c + 10140: 00c16603 lwu a2,12(sp) + 10144: 1b003703 ld a4,432(zero) # 1b0 + 10148: 1b803883 ld a7,440(zero) # 1b8 + 1014c: 00261693 sll a3,a2,0x2 + 10150: 00d70733 add a4,a4,a3 + 10154: 00076303 lwu t1,0(a4) + 10158: ffff0737 lui a4,0xffff0 + 1015c: 0ff70713 add a4,a4,255 # ffffffffffff00ff <__global_pointer$+0xfffffffffffde547> + 10160: 00331693 sll a3,t1,0x3 + 10164: 406686b3 sub a3,a3,t1 + 10168: 00369693 sll a3,a3,0x3 + 1016c: 00d886b3 add a3,a7,a3 + 10170: 0016c883 lbu a7,1(a3) + 10174: 0028f893 and a7,a7,2 + 10178: 0c088663 beqz a7,10244 + 1017c: 19003303 ld t1,400(zero) # 190 + 10180: 00161713 sll a4,a2,0x1 + 10184: 00c70733 add a4,a4,a2 + 10188: 00271713 sll a4,a4,0x2 + 1018c: 00e30333 add t1,t1,a4 + 10190: 00436883 lwu a7,4(t1) + 10194: 1a803703 ld a4,424(zero) # 1a8 + 10198: 01eef153 fadd.s ft2,ft9,ft10 + 1019c: 00389893 sll a7,a7,0x3 + 101a0: 011708b3 add a7,a4,a7 + 101a4: 0008a087 flw ft1,0(a7) + 101a8: 0048a007 flw ft0,4(a7) + 101ac: 00836603 lwu a2,8(t1) + 101b0: 11e0f0d3 fmul.s ft1,ft1,ft10 + 101b4: 0826f153 fsub.s ft2,fa3,ft2 + 101b8: 11e07053 fmul.s ft0,ft0,ft10 + 101bc: 00361613 sll a2,a2,0x3 + 101c0: 00c70633 add a2,a4,a2 + 101c4: 00062787 flw fa5,0(a2) + 101c8: 00462187 flw ft3,4(a2) + 101cc: 00036883 lwu a7,0(t1) + 101d0: 08f177c3 fmadd.s fa5,ft2,fa5,ft1 + 101d4: 00317143 fmadd.s ft2,ft2,ft3,ft0 + 101d8: 00389613 sll a2,a7,0x3 + 101dc: 00c70733 add a4,a4,a2 + 101e0: 00472007 flw ft0,4(a4) + 101e4: 00072087 flw ft1,0(a4) + 101e8: 0086869b addw a3,a3,8 + 101ec: 11d07143 fmadd.s ft2,ft0,ft9,ft2 + 101f0: 79d0f0c3 fmadd.s ft1,ft1,ft9,fa5 + 101f4: f0068053 fmv.w.x ft0,a3 + 101f8: 00006e0b .insn 4, 0x6e0b + 101fc: 28ce9ed3 fmax.s ft9,ft9,fa2 + 10200: 28cf1f53 fmax.s ft10,ft10,fa2 + 10204: 28ce1e53 fmax.s ft8,ft8,fa2 + 10208: 28de8ed3 fmin.s ft9,ft9,fa3 + 1020c: 28df0f53 fmin.s ft10,ft10,fa3 + 10210: 28de0e53 fmin.s ft8,ft8,fa3 + 10214: 70befec3 fmadd.s ft9,ft9,fa1,fa4 + 10218: 70bf7f43 fmadd.s ft10,ft10,fa1,fa4 + 1021c: 70be7e43 fmadd.s ft8,ft8,fa1,fa4 + 10220: c01e9753 fcvt.wu.s a4,ft9,rtz + 10224: c01f16d3 fcvt.wu.s a3,ft10,rtz + 10228: c01e1653 fcvt.wu.s a2,ft8,rtz + 1022c: 0087171b sllw a4,a4,0x8 + 10230: 0106969b sllw a3,a3,0x10 + 10234: 00d76733 or a4,a4,a3 + 10238: 00c76733 or a4,a4,a2 + 1023c: 01076733 or a4,a4,a6 + 10240: 0007071b sext.w a4,a4 + 10244: 00f507b3 add a5,a0,a5 + 10248: 00e7a023 sw a4,0(a5) + 1024c: 0000070b fchthrd a4 + 10250: 10802783 lw a5,264(zero) # 108 + 10254: 0007071b sext.w a4,a4 + 10258: 0af77063 bgeu a4,a5,102f8 + 1025c: 10002783 lw a5,256(zero) # 100 + 10260: 0057569b srlw a3,a4,0x5 + 10264: 0027589b srlw a7,a4,0x2 + 10268: 0027d31b srlw t1,a5,0x2 + 1026c: 0266d63b divuw a2,a3,t1 + 10270: 0078f893 and a7,a7,7 + 10274: 00377713 and a4,a4,3 + 10278: 0266f6bb remuw a3,a3,t1 + 1027c: 0036161b sllw a2,a2,0x3 + 10280: 0116063b addw a2,a2,a7 + 10284: 02c787bb mulw a5,a5,a2 + 10288: 0026969b sllw a3,a3,0x2 + 1028c: 00e686bb addw a3,a3,a4 + 10290: 00d787bb addw a5,a5,a3 + 10294: 02079793 sll a5,a5,0x20 + 10298: 0207d793 srl a5,a5,0x20 + 1029c: 06058463 beqz a1,10304 + 102a0: 16803703 ld a4,360(zero) # 168 + 102a4: 00579693 sll a3,a5,0x5 + 102a8: 00d70733 add a4,a4,a3 + 102ac: 00072007 flw ft0,0(a4) + 102b0: 00472087 flw ft1,4(a4) + 102b4: 00872107 flw ft2,8(a4) + 102b8: 00c72187 flw ft3,12(a4) + 102bc: 01072207 flw ft4,16(a4) + 102c0: 01472287 flw ft5,20(a4) + 102c4: 01872307 flw ft6,24(a4) + 102c8: 01c72387 flw ft7,28(a4) + 102cc: 00005e0b traceray ft8,ft0,0 + 102d0: a1c38753 fle.s a4,ft7,ft8 + 102d4: 01f12627 fsw ft11,12(sp) + 102d8: 00279793 sll a5,a5,0x2 + 102dc: e60702e3 beqz a4,10140 + 102e0: 00f507b3 add a5,a0,a5 + 102e4: 0107a023 sw a6,0(a5) + 102e8: 0000070b fchthrd a4 + 102ec: 10802783 lw a5,264(zero) # 108 + 102f0: 0007071b sext.w a4,a4 + 102f4: f6f764e3 bltu a4,a5,1025c + 102f8: 00000513 li a0,0 + 102fc: 01010113 add sp,sp,16 + 10300: 00008067 ret + 10304: d006f7d3 fcvt.s.w fa5,a3 + 10308: 12002007 flw ft0,288(zero) # 120 + 1030c: d0067353 fcvt.s.w ft6,a2 + 10310: 00e7f7d3 fadd.s fa5,fa5,fa4 + 10314: 14c02387 flw ft7,332(zero) # 14c + 10318: 00e37353 fadd.s ft6,ft6,fa4 + 1031c: 12402287 flw ft5,292(zero) # 124 + 10320: 15002207 flw ft4,336(zero) # 150 + 10324: 5007f7c3 fmadd.s fa5,fa5,ft0,fa0 + 10328: 13c02187 flw ft3,316(zero) # 13c + 1032c: 15402107 flw ft2,340(zero) # 154 + 10330: 13402087 flw ft1,308(zero) # 134 + 10334: 13802007 flw ft0,312(zero) # 138 + 10338: 50537343 fmadd.s ft6,ft6,ft5,fa0 + 1033c: 3817f0c7 fmsub.s ft1,fa5,ft1,ft7 + 10340: 2007f047 fmsub.s ft0,fa5,ft0,ft4 + 10344: 1037f7c7 fmsub.s fa5,fa5,ft3,ft2 + 10348: 14002207 flw ft4,320(zero) # 140 + 1034c: 14802107 flw ft2,328(zero) # 148 + 10350: 14402287 flw ft5,324(zero) # 144 + 10354: 08437243 fmadd.s ft4,ft6,ft4,ft1 + 10358: 211883d3 fmv.s ft7,fa7 + 1035c: 005372c3 fmadd.s ft5,ft6,ft5,ft0 + 10360: 12c02087 flw ft1,300(zero) # 12c + 10364: 78237343 fmadd.s ft6,ft6,ft2,fa5 + 10368: 12802007 flw ft0,296(zero) # 128 + 1036c: 13002107 flw ft2,304(zero) # 130 + 10370: 210801d3 fmv.s ft3,fa6 + 10374: f59ff06f j 102cc + 10378: 00000513 li a0,0 + 1037c: 00008067 ret From 4856fe9880ded2713b711af74734e0e41898fb7a Mon Sep 17 00:00:00 2001 From: KaivalyaD Date: Sun, 22 Mar 2026 05:46:53 +0530 Subject: [PATCH 5/8] Cleanup --- include/rtm/texture.hpp | 76 +++++++++++++++-------------------------- 1 file changed, 28 insertions(+), 48 deletions(-) diff --git a/include/rtm/texture.hpp b/include/rtm/texture.hpp index 40888cf..f40efea 100644 --- a/include/rtm/texture.hpp +++ b/include/rtm/texture.hpp @@ -22,12 +22,12 @@ class Texture2D Texel* texels; private: - // Used by Arches to shallow copy textures - // Not to be used by kernels!!! - uint32_t *pTexRefCount; + // allows Arches to shallow copy textures + // not to be used by kernels!!! + int32_t *pRefCount; public: - Texture2D() : texels(nullptr), pTexRefCount(nullptr) {}; + Texture2D() : texels(nullptr), pRefCount(nullptr) {}; #ifndef __riscv Texture2D(std::string filename) { @@ -45,66 +45,47 @@ class Texture2D stbi_image_free(data); - pTexRefCount = (uint32_t *)malloc(sizeof(uint32_t)); - *pTexRefCount = 1; // texture acquired first time + pRefCount = (int32_t *)malloc(sizeof(int32_t)); + *pRefCount = 1; // acquired first time by direct assignment printf("Loaded: %s \n", filename.c_str()); } else { texels = nullptr; - pTexRefCount = nullptr; + pRefCount = nullptr; printf("Failed: %s \n", filename.c_str()); } } - // Texture2D(const Texture2D& other) - // { - // memcpy(this, &other, sizeof(Texture2D)); - // uint32_t size = sizeof(Texel) * width * height; - // texels = (Texel*)malloc(size); - // memcpy(texels, other.texels, size); - // } - Texture2D(const Texture2D& other) : width(other.width), height(other.height), comp(other.comp), pTexRefCount(other.pTexRefCount) + Texture2D(const Texture2D& other) : width(other.width), height(other.height), comp(other.comp) { + other.acquire(); texels = other.texels; - if(pTexRefCount) - acquire_texture_resource(); + pRefCount = other.pRefCount; } - // Texture2D& operator=(const Texture2D& other) - // { - // if(texels) free(texels); - // memcpy(this, &other, sizeof(Texture2D)); - // uint32_t size = sizeof(Texel) * width * height; - // texels = (Texel*)malloc(size); - // memcpy(texels, other.texels, size); - // return *this; - // } Texture2D& operator=(const Texture2D& other) { if (this == &other) return *this; - - // acquire ownership of other's resource - // this is safer than lose-first-acquire-next - if(other.pTexRefCount) - ++(*other.pTexRefCount); - - release_texture_resource(); // lose previous ownership - // assign other to this + // copy other members width = other.width; height = other.height; comp = other.comp; - texels = other.texels; - pTexRefCount = other.pTexRefCount; + + // following order is safer than 'first release, then acquire' + other.acquire(); // prebook ownership of new resource + release(); // release ownership of current resource + texels = other.texels; // acquire ownership of new resource + pRefCount = other.pRefCount; return *this; } ~Texture2D() { - release_texture_resource(); + release(); } #endif @@ -141,29 +122,28 @@ class Texture2D #ifndef __riscv private: - void acquire_texture_resource(void) + void acquire(void) const { - if (!pTexRefCount) + if (!pRefCount) return; - (*pTexRefCount)++; + (*pRefCount)++; } - void release_texture_resource(void) + void release(void) { - if (!pTexRefCount) + if (!pRefCount) return; - (*pTexRefCount)--; - - if((*pTexRefCount) <= 0) + (*pRefCount)--; + if((*pRefCount) <= 0) { if(texels) free(texels); - - free(pTexRefCount); texels = nullptr; - pTexRefCount = nullptr; + + free(pRefCount); + pRefCount = nullptr; } } #endif From 458a456eedf3a3170d09b5c09ce42f4a392d11d5 Mon Sep 17 00:00:00 2001 From: KaivalyaD Date: Tue, 24 Mar 2026 20:34:12 +0530 Subject: [PATCH 6/8] Linux specifics are under flags --- CMakeLists.txt | 1 - include/rtm/bvh.hpp | 3 ++- include/rtm/float.hpp | 19 ++++++++----- include/rtm/macros.hpp | 14 ++++++++++ src/arches-v2/CMakeLists.txt | 17 +++++++----- src/arches-v2/isa/riscv.cpp | 9 ++++--- src/arches-v2/shared-utils.hpp | 12 ++++++--- src/arches-v2/stdafx.hpp | 31 +++++++++++++++------- src/arches-v2/util/alignment-allocator.hpp | 14 +++++++--- src/arches-v2/util/bit-manipulation.hpp | 15 ++++++++--- src/arches-v2/util/stbi.cpp | 5 +++- src/dual-streaming-kernel/CMakeLists.txt | 3 --- src/strata-rt-kernel/CMakeLists.txt | 3 --- src/trax-kernel/CMakeLists.txt | 8 +++--- 14 files changed, 106 insertions(+), 48 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f1e772a..1989187 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,7 +5,6 @@ project(arches-v2 LANGUAGES CXX) set_property(GLOBAL PROPERTY USE_FOLDERS ON) set(CMAKE_CXX_STANDARD 20) -set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) add_subdirectory(external) diff --git a/include/rtm/bvh.hpp b/include/rtm/bvh.hpp index 34c35fa..e18ecbd 100644 --- a/include/rtm/bvh.hpp +++ b/include/rtm/bvh.hpp @@ -1,5 +1,6 @@ #pragma once +#include "macros.hpp" #include "int.hpp" #include "aabb.hpp" #include "mesh.hpp" @@ -718,7 +719,7 @@ class BVH cache.decisions[j] = cache.decisions[j - 1]; } - // if(cache.decisions[1].cost == INFINITY) __debugbreak(); + if(cache.decisions[1].cost == INFINITY) add_breakpoint(); } struct CostItem diff --git a/include/rtm/float.hpp b/include/rtm/float.hpp index fba4dc3..b05785f 100644 --- a/include/rtm/float.hpp +++ b/include/rtm/float.hpp @@ -5,13 +5,20 @@ #ifndef __riscv #include -// #include -#include -extern __m128 _mm_cos_ps(__m128 __A); // Defined but not declared in immintrin.h -extern __m128 _mm_sin_ps(__m128 __A); // Defined but not declared in immintrin.h + #if defined BUILD_PLATFORM_WINDOWS + #include + #elif defined BUILD_PLATFORM_LINUX + #include + #endif #include #endif +#ifdef BUILD_PLATFORM_LINUX + // Declarations of intrinsics used that are defined but not declared in immintrin + extern __m128 _mm_cos_ps(__m128 __A); + extern __m128 _mm_sin_ps(__m128 __A); +#endif + namespace rtm { @@ -160,7 +167,7 @@ inline int32_t f32_to_i24(float f32, uint8_t max_exp = 127, int rounding = 0) else if(rounding == 1) norm = std::ceil(norm); - // if(norm > ((1 << 23) - 1) || norm < -(1 << 23)) __debugbreak(); + if(norm > ((1 << 23) - 1) || norm < -(1 << 23)) add_breakpoint(); return (int32_t)norm; } @@ -188,7 +195,7 @@ inline uint16_t f32_to_i16(float f32, uint8_t max_exp = 127, int rounding = 0) else if(rounding == 1) norm = std::ceil(norm); - // if(norm > ((1 << 15) - 1) || norm < -(1 << 15)) __debugbreak(); + if(norm > ((1 << 15) - 1) || norm < -(1 << 15)) add_breakpoint(); return (int16_t)norm; } #endif diff --git a/include/rtm/macros.hpp b/include/rtm/macros.hpp index 1096482..9844b51 100644 --- a/include/rtm/macros.hpp +++ b/include/rtm/macros.hpp @@ -2,4 +2,18 @@ #if defined __x86_64__ || defined _M_X64 #define __x86 +#endif + +#if defined _WIN16 || defined WIN32 || defined _WIN32 || defined WIN64 || defined _WIN64 || defined __WIN32__ || defined __TOS_WIN__ || defined __WINDOWS__ + #define BUILD_PLATFORM_WINDOWS +#elif defined __linux__ + #define BUILD_PLATFORM_LINUX +#endif + +//To add breakpoints for debugging at runtime +#if defined BUILD_PLATFORM_WINDOWS + #define add_breakpoint() __debugbreak() +#elif defined BUILD_PLATFORM_LINUX + #include + #define add_breakpoint() raise(SIGINT) #endif \ No newline at end of file diff --git a/src/arches-v2/CMakeLists.txt b/src/arches-v2/CMakeLists.txt index 90125e7..e2067c8 100644 --- a/src/arches-v2/CMakeLists.txt +++ b/src/arches-v2/CMakeLists.txt @@ -1,11 +1,13 @@ cmake_minimum_required(VERSION 3.14) add_compile_definitions(UNICODE _UNICODE) -# Enable intrinsics like _lzcnt_u64 - find this out -add_compile_options(-march=native) - set(PROJECT_NAME "arches-v2") +if(LINUX) + # enable intrinsics like _lzcnt_u64 + add_compile_options(-march=native) +endif() + file(GLOB_RECURSE ALL_INCLUDE CONFIGURE_DEPENDS "*.hpp" "*.h") file(GLOB_RECURSE ALL_SOURCES CONFIGURE_DEPENDS "*.cpp" "*.cc") file(GLOB_RECURSE USIMM_CONFIG CONFIGURE_DEPENDS "*.vi" "*.cfg") @@ -56,9 +58,12 @@ target_include_directories(${PROJECT_NAME} PUBLIC ${PROJECT_SOURCE_DIR}/src) set_property(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTY VS_STARTUP_PROJECT DISABLE) set_target_properties(${PROJECT_NAME} PROPERTIES OUTPUT_NAME ${PROJECT_NAME}) -# target_link_directories(${PROJECT_NAME} PUBLIC ${PROJECT_SOURCE_DIR}/libraries/tbb) -# target_link_libraries(${PROJECT_NAME} PRIVATE tbb12.lib) -target_link_libraries(${PROJECT_NAME} PRIVATE tbb) +if(WINDOWS) + target_link_directories(${PROJECT_NAME} PUBLIC ${PROJECT_SOURCE_DIR}/libraries/tbb) + target_link_libraries(${PROJECT_NAME} PRIVATE tbb12.lib) +elseif(LINUX) + target_link_libraries(${PROJECT_NAME} PRIVATE tbb) +endif() target_link_libraries(${PROJECT_NAME} PRIVATE Ramulator) #set_target_properties(${PROJECT_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY_DEBUG ${CMAKE_CURRENT_BINARY_DIR}) #set_target_properties(${PROJECT_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY_RELEASE ${CMAKE_CURRENT_BINARY_DIR}) diff --git a/src/arches-v2/isa/riscv.cpp b/src/arches-v2/isa/riscv.cpp index e4e7b82..82e00fb 100644 --- a/src/arches-v2/isa/riscv.cpp +++ b/src/arches-v2/isa/riscv.cpp @@ -1,7 +1,10 @@ #include "riscv.hpp" -// #include -#include +#if defined BUILD_PLATFORM_WINDOWS + #include +#elif defined BUILD_PLATFORM_LINUX + #include +#endif #include "errors.hpp" #include "util/bit-manipulation.hpp" @@ -204,7 +207,7 @@ InstructionInfo const isa_SYSTEM[2] = InstructionInfo(0b000000000001, "ebreak", InstrType::SYS, Encoding::I, RegFile::INT, EXEC_DECL { //break point - // __debugbreak(); + add_breakpoint(); }), }; diff --git a/src/arches-v2/shared-utils.hpp b/src/arches-v2/shared-utils.hpp index fa688af..6ce973c 100644 --- a/src/arches-v2/shared-utils.hpp +++ b/src/arches-v2/shared-utils.hpp @@ -17,10 +17,14 @@ #include "isa/riscv.hpp" #include "rtm/rtm.hpp" -// #include -#include - -char full_exe_name[FILENAME_MAX]; +#if defined BUILD_PLATFORM_WINDOWS + #define MAX_FILENAME_LENGTH MAX_PATH +#elif defined BUILD_PLATFORM_LINUX + #include + #define MAX_FILENAME_LENGTH FILENAME_MAX +#endif + +char full_exe_name[MAX_FILENAME_LENGTH]; namespace Arches { void set_full_exe_name(const char *name) { diff --git a/src/arches-v2/stdafx.hpp b/src/arches-v2/stdafx.hpp index 73d7da4..4e08865 100644 --- a/src/arches-v2/stdafx.hpp +++ b/src/arches-v2/stdafx.hpp @@ -1,8 +1,14 @@ #pragma once -//Determine debug/release +//Determine platform #if defined _WIN16 || defined WIN32 || defined _WIN32 || defined WIN64 || defined _WIN64 || defined __WIN32__ || defined __TOS_WIN__ || defined __WINDOWS__ #define BUILD_PLATFORM_WINDOWS +#elif defined __linux__ + #define BUILD_PLATFORM_LINUX +#endif + +//Determine debug/release +#ifdef BUILD_PLATFORM_WINDOWS #if defined _DEBUG || defined DEBUG #define BUILD_DEBUG @@ -108,6 +114,14 @@ #endif #define notimpl implerr +//To add breakpoints for debugging at runtime +#if defined BUILD_PLATFORM_WINDOWS + #define add_breakpoint() __debugbreak() +#elif defined BUILD_PLATFORM_LINUX + #include + #define add_breakpoint() raise(SIGINT) +#endif + //Language facts #if -2>>1 == -1 #define SHIFTRIGHT_ARITHMETIC //Pads with sign bit @@ -142,18 +156,17 @@ #include #include -// #include -// #include -#include - - - +//Include platform intrinsics header +#if defined BUILD_PLATFORM_WINDOWS + #include +#elif defined BUILD_PLATFORM_LINUX + #include +#endif #ifndef _DEBUG inline void _assert(bool x) { - // if(!x) __debugbreak(); - assert(x); + if(!x) add_breakpoint(); } #else inline void _assert(bool x) diff --git a/src/arches-v2/util/alignment-allocator.hpp b/src/arches-v2/util/alignment-allocator.hpp index b9126cc..44bcb7f 100644 --- a/src/arches-v2/util/alignment-allocator.hpp +++ b/src/arches-v2/util/alignment-allocator.hpp @@ -35,14 +35,20 @@ class AlignmentAllocator inline pointer allocate(size_type n) { - // return (pointer)_aligned_malloc(n * sizeof(value_type), N); - return (pointer)aligned_alloc(N, n * sizeof(value_type)); + #if defined BUILD_PLATFORM_WINDOWS + return (pointer)_aligned_malloc(n * sizeof(value_type), N); + #elif defined BUILD_PLATFORM_LINUX + return (pointer)aligned_alloc(N, n * sizeof(value_type)); + #endif } inline void deallocate(pointer p, size_type) { - // _aligned_free(p); - free(p); + #if defined BUILD_PLATFORM_WINDOWS + _aligned_free(p); + #elif defined BUILD_PLATFORM_LINUX + free(p); + #endif } inline void construct(pointer p, const value_type& wert) diff --git a/src/arches-v2/util/bit-manipulation.hpp b/src/arches-v2/util/bit-manipulation.hpp index 912d1ad..a5da0c5 100644 --- a/src/arches-v2/util/bit-manipulation.hpp +++ b/src/arches-v2/util/bit-manipulation.hpp @@ -1,6 +1,15 @@ #pragma once #include "stdafx.hpp" +//cross platform intrinsics +#if defined BUILD_PLATFORM_WINDOWS + #define popcnt64(mask) __popcnt64(mask) + #define rotr64(mask, n) _rotr64(mask, n) +#elif defined BUILD_PLATFORM_LINUX + #define popcnt64(mask) _popcnt64(mask) + #define rotr64(mask, n) _lrotr(mask, n) +#endif + inline uint log2i(uint64_t in) { uint i = 0; @@ -34,14 +43,12 @@ inline uint clz(uint64_t mask) inline uint popcnt(uint64_t mask) { - // return __popcnt64(mask); - return _popcnt64(mask); + return popcnt64(mask); } inline uint64_t rotr(uint64_t mask, uint n) { - // return _rotr64(mask, n); - return _lrotr(mask, n); + return rotr64(mask, n); } inline uint64_t pdep(uint64_t data, uint64_t mask) diff --git a/src/arches-v2/util/stbi.cpp b/src/arches-v2/util/stbi.cpp index 36dfd2a..7a0eb89 100644 --- a/src/arches-v2/util/stbi.cpp +++ b/src/arches-v2/util/stbi.cpp @@ -1,7 +1,10 @@ //stb image +#include "stdafx.hpp" #define STB_IMAGE_IMPLEMENTATION #define STB_IMAGE_WRITE_IMPLEMENTATION -// #define STBI_MSC_SECURE_CRT +#ifdef BUILD_PLATFORM_WINDOWS + #define STBI_MSC_SECURE_CRT +#endif //stb image #include "stb_image.h" diff --git a/src/dual-streaming-kernel/CMakeLists.txt b/src/dual-streaming-kernel/CMakeLists.txt index dcd9724..8e8885e 100644 --- a/src/dual-streaming-kernel/CMakeLists.txt +++ b/src/dual-streaming-kernel/CMakeLists.txt @@ -1,8 +1,5 @@ cmake_minimum_required(VERSION 3.14) -# Enable intrinsics like _lzcnt_u64 - find this out -add_compile_options(-march=native) - set(PROJECT_NAME "dual-streaming-kernel") file(GLOB FILES *.hpp *.cpp) diff --git a/src/strata-rt-kernel/CMakeLists.txt b/src/strata-rt-kernel/CMakeLists.txt index f5d0e53..b956b4a 100644 --- a/src/strata-rt-kernel/CMakeLists.txt +++ b/src/strata-rt-kernel/CMakeLists.txt @@ -1,8 +1,5 @@ cmake_minimum_required(VERSION 3.14) -# Enable intrinsics like _lzcnt_u64 - find this out -add_compile_options(-march=native) - set(PROJECT_NAME "strata-rt-kernel") file(GLOB FILES *.hpp *.cpp) diff --git a/src/trax-kernel/CMakeLists.txt b/src/trax-kernel/CMakeLists.txt index 650590e..04fdaf3 100644 --- a/src/trax-kernel/CMakeLists.txt +++ b/src/trax-kernel/CMakeLists.txt @@ -1,10 +1,12 @@ cmake_minimum_required(VERSION 3.14) -# Enable intrinsics like _lzcnt_u64 - find this out -add_compile_options(-march=native) - set(PROJECT_NAME "trax-kernel") +if(LINUX) + # enable intrinsics like _lzcnt_u64 - find this out + add_compile_options(-march=native) +endif() + file(GLOB FILES *.hpp *.cpp) add_executable(${PROJECT_NAME} ${FILES}) target_include_directories(${PROJECT_NAME} PUBLIC ${PROJECT_SOURCE_DIR}/trax-kernel) From 028309da914dabdf6e45cda4002d4af6634fd772 Mon Sep 17 00:00:00 2001 From: KaivalyaD Date: Sat, 28 Mar 2026 18:56:44 -0600 Subject: [PATCH 7/8] Add windows header for filename size limit --- CMakeSettings.json | 48 ++++++++++++++++++++++++++++++++++ src/arches-v2/shared-utils.hpp | 1 + 2 files changed, 49 insertions(+) create mode 100644 CMakeSettings.json diff --git a/CMakeSettings.json b/CMakeSettings.json new file mode 100644 index 0000000..5f3cb9c --- /dev/null +++ b/CMakeSettings.json @@ -0,0 +1,48 @@ +{ + "configurations": [ + { + "name": "Debug", + "generator": "Visual Studio 18 2026 Win64", + "configurationType": "Debug", + "inheritEnvironments": [ "msvc_x64_x64" ], + "buildRoot": "${projectDir}\\build\\", + "installRoot": "${projectDir}\\out\\install\\${name}", + "cmakeCommandArgs": "-DCMAKE_POLICY_VERSION_MINIMUM=3.5", + "buildCommandArgs": "", + "ctestCommandArgs": "" + }, + { + "name": "Release", + "generator": "Visual Studio 18 2026 Win64", + "configurationType": "Release", + "buildRoot": "${projectDir}\\build", + "installRoot": "${projectDir}\\out\\install\\${name}", + "cmakeCommandArgs": "-DCMAKE_POLICY_VERSION_MINIMUM=3.5", + "buildCommandArgs": "", + "ctestCommandArgs": "", + "inheritEnvironments": [ "msvc_x64_x64" ] + }, + { + "name": "MinSizeRel", + "generator": "Visual Studio 18 2026 Win64", + "configurationType": "MinSizeRel", + "buildRoot": "${projectDir}\\build\\", + "installRoot": "${projectDir}\\out\\install\\${name}", + "cmakeCommandArgs": "-DCMAKE_POLICY_VERSION_MINIMUM=3.5", + "buildCommandArgs": "", + "ctestCommandArgs": "", + "inheritEnvironments": [ "msvc_x64_x64" ] + }, + { + "name": "RelWithDebInfo", + "generator": "Visual Studio 18 2026 Win64", + "configurationType": "RelWithDebInfo", + "buildRoot": "${projectDir}\\build\\", + "installRoot": "${projectDir}\\out\\install\\${name}", + "cmakeCommandArgs": "-DCMAKE_POLICY_VERSION_MINIMUM=3.5", + "buildCommandArgs": "", + "ctestCommandArgs": "", + "inheritEnvironments": [ "msvc_x64_x64" ] + } + ] +} \ No newline at end of file diff --git a/src/arches-v2/shared-utils.hpp b/src/arches-v2/shared-utils.hpp index 6ce973c..72bdddd 100644 --- a/src/arches-v2/shared-utils.hpp +++ b/src/arches-v2/shared-utils.hpp @@ -18,6 +18,7 @@ #include "rtm/rtm.hpp" #if defined BUILD_PLATFORM_WINDOWS + #include #define MAX_FILENAME_LENGTH MAX_PATH #elif defined BUILD_PLATFORM_LINUX #include From 434b0944161794aa3838d5cc3f7fa41d7375e97d Mon Sep 17 00:00:00 2001 From: Kaivalya Deshpande <63287674+KaivalyaD@users.noreply.github.com> Date: Sat, 28 Mar 2026 19:33:20 -0600 Subject: [PATCH 8/8] Delete CMakeSettings.json --- CMakeSettings.json | 48 ---------------------------------------------- 1 file changed, 48 deletions(-) delete mode 100644 CMakeSettings.json diff --git a/CMakeSettings.json b/CMakeSettings.json deleted file mode 100644 index 5f3cb9c..0000000 --- a/CMakeSettings.json +++ /dev/null @@ -1,48 +0,0 @@ -{ - "configurations": [ - { - "name": "Debug", - "generator": "Visual Studio 18 2026 Win64", - "configurationType": "Debug", - "inheritEnvironments": [ "msvc_x64_x64" ], - "buildRoot": "${projectDir}\\build\\", - "installRoot": "${projectDir}\\out\\install\\${name}", - "cmakeCommandArgs": "-DCMAKE_POLICY_VERSION_MINIMUM=3.5", - "buildCommandArgs": "", - "ctestCommandArgs": "" - }, - { - "name": "Release", - "generator": "Visual Studio 18 2026 Win64", - "configurationType": "Release", - "buildRoot": "${projectDir}\\build", - "installRoot": "${projectDir}\\out\\install\\${name}", - "cmakeCommandArgs": "-DCMAKE_POLICY_VERSION_MINIMUM=3.5", - "buildCommandArgs": "", - "ctestCommandArgs": "", - "inheritEnvironments": [ "msvc_x64_x64" ] - }, - { - "name": "MinSizeRel", - "generator": "Visual Studio 18 2026 Win64", - "configurationType": "MinSizeRel", - "buildRoot": "${projectDir}\\build\\", - "installRoot": "${projectDir}\\out\\install\\${name}", - "cmakeCommandArgs": "-DCMAKE_POLICY_VERSION_MINIMUM=3.5", - "buildCommandArgs": "", - "ctestCommandArgs": "", - "inheritEnvironments": [ "msvc_x64_x64" ] - }, - { - "name": "RelWithDebInfo", - "generator": "Visual Studio 18 2026 Win64", - "configurationType": "RelWithDebInfo", - "buildRoot": "${projectDir}\\build\\", - "installRoot": "${projectDir}\\out\\install\\${name}", - "cmakeCommandArgs": "-DCMAKE_POLICY_VERSION_MINIMUM=3.5", - "buildCommandArgs": "", - "ctestCommandArgs": "", - "inheritEnvironments": [ "msvc_x64_x64" ] - } - ] -} \ No newline at end of file