diff --git a/.gitignore b/.gitignore index 3a84d71..d921c36 100644 --- a/.gitignore +++ b/.gitignore @@ -7,8 +7,9 @@ # objs datasets -# build -build +# directories +[Bb]uild* +venv ## log info *_log.txt diff --git a/include/rtm/aabb.hpp b/include/rtm/aabb.hpp index 9071486..55ba41a 100644 --- a/include/rtm/aabb.hpp +++ b/include/rtm/aabb.hpp @@ -1,5 +1,6 @@ #pragma once +#include #include "float.hpp" #include "int.hpp" #include "vec3.hpp" diff --git a/include/rtm/bvh.hpp b/include/rtm/bvh.hpp index 1e6aa34..e18ecbd 100644 --- a/include/rtm/bvh.hpp +++ b/include/rtm/bvh.hpp @@ -1,5 +1,6 @@ #pragma once +#include "macros.hpp" #include "int.hpp" #include "aabb.hpp" #include "mesh.hpp" @@ -718,7 +719,7 @@ class BVH cache.decisions[j] = cache.decisions[j - 1]; } - if(cache.decisions[1].cost == INFINITY) __debugbreak(); + if(cache.decisions[1].cost == INFINITY) add_breakpoint(); } struct CostItem diff --git a/include/rtm/float.hpp b/include/rtm/float.hpp index af842cd..b05785f 100644 --- a/include/rtm/float.hpp +++ b/include/rtm/float.hpp @@ -5,10 +5,20 @@ #ifndef __riscv #include -#include + #if defined BUILD_PLATFORM_WINDOWS + #include + #elif defined BUILD_PLATFORM_LINUX + #include + #endif #include #endif +#ifdef BUILD_PLATFORM_LINUX + // Declarations of intrinsics used that are defined but not declared in immintrin + extern __m128 _mm_cos_ps(__m128 __A); + extern __m128 _mm_sin_ps(__m128 __A); +#endif + namespace rtm { @@ -157,7 +167,7 @@ inline int32_t f32_to_i24(float f32, uint8_t max_exp = 127, int rounding = 0) else if(rounding == 1) norm = std::ceil(norm); - if(norm > ((1 << 23) - 1) || norm < -(1 << 23)) __debugbreak(); + if(norm > ((1 << 23) - 1) || norm < -(1 << 23)) add_breakpoint(); return (int32_t)norm; } @@ -185,7 +195,7 @@ inline uint16_t f32_to_i16(float f32, uint8_t max_exp = 127, int rounding = 0) else if(rounding == 1) norm = std::ceil(norm); - if(norm > ((1 << 15) - 1) || norm < -(1 << 15)) __debugbreak(); + if(norm > ((1 << 15) - 1) || norm < -(1 << 15)) add_breakpoint(); return (int16_t)norm; } #endif diff --git a/include/rtm/HECWBVH.HPP b/include/rtm/hecwbvh.hpp similarity index 100% rename from include/rtm/HECWBVH.HPP rename to include/rtm/hecwbvh.hpp diff --git a/include/rtm/macros.hpp b/include/rtm/macros.hpp index 1096482..9844b51 100644 --- a/include/rtm/macros.hpp +++ b/include/rtm/macros.hpp @@ -2,4 +2,18 @@ #if defined __x86_64__ || defined _M_X64 #define __x86 +#endif + +#if defined _WIN16 || defined WIN32 || defined _WIN32 || defined WIN64 || defined _WIN64 || defined __WIN32__ || defined __TOS_WIN__ || defined __WINDOWS__ + #define BUILD_PLATFORM_WINDOWS +#elif defined __linux__ + #define BUILD_PLATFORM_LINUX +#endif + +//To add breakpoints for debugging at runtime +#if defined BUILD_PLATFORM_WINDOWS + #define add_breakpoint() __debugbreak() +#elif defined BUILD_PLATFORM_LINUX + #include + #define add_breakpoint() raise(SIGINT) #endif \ No newline at end of file diff --git a/include/rtm/mesh.hpp b/include/rtm/mesh.hpp index 1378f58..7404821 100644 --- a/include/rtm/mesh.hpp +++ b/include/rtm/mesh.hpp @@ -67,8 +67,14 @@ class Mesh public: Mesh(std::string file_path) : mtl_lib("") { - load_obj(file_path.c_str()); - load_mtl(swap_path(mtl_lib, "../../../datasets/"), "../../../datasets/textures/"); + if(!load_obj(file_path.c_str())) { + printf("Mesh: failed to load mesh OBJ file %s\n", file_path.c_str()); + return; + } + if(!load_mtl(swap_path(mtl_lib, "datasets/"), "datasets/textures/")) { + printf("Mesh: failed to load mesh textures or MTL file %s\n", mtl_lib); + return; + } } static inline rtm::vec2 read_vec2(char* line) diff --git a/include/rtm/texture.hpp b/include/rtm/texture.hpp index 0e88a98..f40efea 100644 --- a/include/rtm/texture.hpp +++ b/include/rtm/texture.hpp @@ -21,8 +21,13 @@ class Texture2D int32_t comp; Texel* texels; +private: + // allows Arches to shallow copy textures + // not to be used by kernels!!! + int32_t *pRefCount; + public: - Texture2D() : texels(nullptr) {}; + Texture2D() : texels(nullptr), pRefCount(nullptr) {}; #ifndef __riscv Texture2D(std::string filename) { @@ -39,80 +44,108 @@ class Texture2D texels[i].channel[j] = data[i * comp + j]; stbi_image_free(data); + + pRefCount = (int32_t *)malloc(sizeof(int32_t)); + *pRefCount = 1; // acquired first time by direct assignment + printf("Loaded: %s \n", filename.c_str()); } else { texels = nullptr; + pRefCount = nullptr; printf("Failed: %s \n", filename.c_str()); } } - Texture2D(const Texture2D& other) + Texture2D(const Texture2D& other) : width(other.width), height(other.height), comp(other.comp) + { + other.acquire(); + texels = other.texels; + pRefCount = other.pRefCount; + } + + Texture2D& operator=(const Texture2D& other) + { + if (this == &other) + return *this; + + // copy other members + width = other.width; + height = other.height; + comp = other.comp; + + // following order is safer than 'first release, then acquire' + other.acquire(); // prebook ownership of new resource + release(); // release ownership of current resource + texels = other.texels; // acquire ownership of new resource + pRefCount = other.pRefCount; + return *this; + } + + ~Texture2D() { - memcpy(this, &other, sizeof(Texture2D)); - uint32_t size = sizeof(Texel) * width * height; - texels = (Texel*)malloc(size); - memcpy(texels, other.texels, size); + release(); } +#endif - Texture2D& operator=(const Texture2D& other) + rtm::vec4 sample(const rtm::vec2& uv) const { - if(texels) free(texels); - memcpy(this, &other, sizeof(Texture2D)); - uint32_t size = sizeof(Texel) * width * height; - texels = (Texel*)malloc(size); - memcpy(texels, other.texels, size); - return *this; + if(width == 0 && height == 0) return rtm::vec4(0.0f); + return read_nearest(uv); } - ~Texture2D() + rtm::uvec2 get_iuv(const rtm::vec2& uv, const rtm::vec2& offset = rtm::vec2(0.0f)) const { - if(texels) - free(texels); + rtm::vec2 fuv = uv * rtm::vec2(width, height) + offset; + return rtm::uvec2(fuv[0], fuv[1]); } -#endif - rtm::vec3 sample(const rtm::vec2& uv) const + Texel* get_texel_addr(const rtm::uvec2& iuv) const { - if(width == 0 && height == 0) return rtm::vec3(0.0f); - if(width == 1 && height == 1) return read({0, 0}); - - rtm::vec2 _uv = rtm::mod(uv, rtm::vec2(1.0f)) * rtm::vec2(width, height); - //return read_nearest(_uv); - - //rtm::vec2 _uv = rtm::mod(uv, rtm::vec2(1.0f)) * rtm::vec2(width, height); - rtm::vec3 s00 = read_nearest(_uv + rtm::vec2(-0.5f, -0.5f)); - rtm::vec3 s10 = read_nearest(_uv + rtm::vec2(0.5f, -0.5f)); - rtm::vec3 s01 = read_nearest(_uv + rtm::vec2(-0.5f, 0.5f)); - rtm::vec3 s11 = read_nearest(_uv + rtm::vec2(0.5f, 0.5f)); - - rtm::vec2 ic = rtm::mod(_uv + rtm::vec2(0.5f), rtm::vec2(1.0f)); - rtm::vec3 s0 = rtm::mix(s00, s01, ic.y); - rtm::vec3 s1 = rtm::mix(s10, s11, ic.y); - - return rtm::mix(s0, s1, ic.x); + uint32_t x = iuv[0] % width; + uint32_t y = iuv[1] % height; + return &texels[y * width + x]; } -private: - rtm::vec3 read(const rtm::uvec2& iuv) const + static rtm::vec4 decode_texel(const Texel& texel) { - uint i = iuv[1] * width + iuv[0]; - return rtm::vec3(texels[i].channel[0], texels[i].channel[1], texels[i].channel[2]) * (1.0f / 255.0f); + return rtm::vec4(texel.channel[0], texel.channel[1], texel.channel[2], texel.channel[3]) * (1.0f / 255.0f); } - rtm::uvec2 get_nearest(const rtm::vec2& uv) const + rtm::vec4 read_nearest(const rtm::vec2& uv, rtm::vec2 offset = rtm::vec2(0.0f)) const { - rtm::uvec2 mv = rtm::uvec2(width, height); - rtm::uvec2 iuv = rtm::uvec2(uv[0], uv[1]) + mv; - iuv[0] = iuv[0] % mv[0]; iuv[1] = iuv[1] % mv[1]; - return iuv; + rtm::uvec2 iuv = get_iuv(uv, offset); + Texel* texel = get_texel_addr(iuv); + return decode_texel(*texel); } - rtm::vec3 read_nearest(const rtm::vec2& uv) const +#ifndef __riscv +private: + void acquire(void) const { - rtm::vec3 v = read(get_nearest(uv)); - return v; + if (!pRefCount) + return; + + (*pRefCount)++; } + + void release(void) + { + if (!pRefCount) + return; + + (*pRefCount)--; + if((*pRefCount) <= 0) + { + if(texels) + free(texels); + texels = nullptr; + + free(pRefCount); + pRefCount = nullptr; + } + } +#endif }; diff --git a/scripts/log-parser.py b/scripts/log-parser.py index 655289c..8ed6de0 100644 --- a/scripts/log-parser.py +++ b/scripts/log-parser.py @@ -39,6 +39,10 @@ def gauss(n=11,sigma=1): data[i].append(float(re.findall(r'[\d]*[.][\d]+', line)[0])) +print(f"Total Cycles found: {len(cycles)}") +for i in range(len(keys)): + print(f"Key '{keys[i]}' found {len(data[i])} times") + show_unsmoothed = True if show_unsmoothed: for i in range(len(keys)): @@ -49,13 +53,33 @@ def gauss(n=11,sigma=1): kernel = np.array(gauss(kernel_size, 5)) kernel = kernel / kernel.sum() +# for i in range(len(keys)): +# if len(data[i]) == len(cycles): +# plt.plot(cycles, np.convolve(data[i], kernel, mode='same'), colors[i], label=keys[i]) + for i in range(len(keys)): - if len(data[i]) == len(cycles): - plt.plot(cycles, np.convolve(data[i], kernel, mode='same'), colors[i], label=keys[i]) + # determine the shortest length to keep (x, y) in sync + min_len = min(len(data[i]), len(cycles)) + + if min_len > 0: + # slice both to min_len so (x, y) match perfectly + x = cycles[:min_len] + y = data[i][:min_len] + + # plot unsmoothed (faded) + plt.plot(x, y, colors[i], alpha=0.125) + + # plot smoothed (solid with label) + smoothed = np.convolve(y, kernel, mode='same') + plt.plot(x, smoothed, colors[i], label=keys[i]) + else: + print(f"Skipping {keys[i]} - no data matches found.") plt.legend(loc="upper right") plt.ylabel(ylabel) plt.xlabel('Time (cycles)') plt.yscale('log') plt.grid() -plt.show() \ No newline at end of file +plt.savefig("simulation_results.png") # save instead of show +print("Plot saved to simulation_results.png") +#plt.show() \ No newline at end of file diff --git a/src/arches-v2/CMakeLists.txt b/src/arches-v2/CMakeLists.txt index 977b40f..e2067c8 100644 --- a/src/arches-v2/CMakeLists.txt +++ b/src/arches-v2/CMakeLists.txt @@ -3,6 +3,11 @@ add_compile_definitions(UNICODE _UNICODE) set(PROJECT_NAME "arches-v2") +if(LINUX) + # enable intrinsics like _lzcnt_u64 + add_compile_options(-march=native) +endif() + file(GLOB_RECURSE ALL_INCLUDE CONFIGURE_DEPENDS "*.hpp" "*.h") file(GLOB_RECURSE ALL_SOURCES CONFIGURE_DEPENDS "*.cpp" "*.cc") file(GLOB_RECURSE USIMM_CONFIG CONFIGURE_DEPENDS "*.vi" "*.cfg") @@ -53,8 +58,12 @@ target_include_directories(${PROJECT_NAME} PUBLIC ${PROJECT_SOURCE_DIR}/src) set_property(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTY VS_STARTUP_PROJECT DISABLE) set_target_properties(${PROJECT_NAME} PROPERTIES OUTPUT_NAME ${PROJECT_NAME}) -target_link_directories(${PROJECT_NAME} PUBLIC ${PROJECT_SOURCE_DIR}/libraries/tbb) -target_link_libraries(${PROJECT_NAME} PRIVATE tbb12.lib) +if(WINDOWS) + target_link_directories(${PROJECT_NAME} PUBLIC ${PROJECT_SOURCE_DIR}/libraries/tbb) + target_link_libraries(${PROJECT_NAME} PRIVATE tbb12.lib) +elseif(LINUX) + target_link_libraries(${PROJECT_NAME} PRIVATE tbb) +endif() target_link_libraries(${PROJECT_NAME} PRIVATE Ramulator) #set_target_properties(${PROJECT_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY_DEBUG ${CMAKE_CURRENT_BINARY_DIR}) #set_target_properties(${PROJECT_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY_RELEASE ${CMAKE_CURRENT_BINARY_DIR}) diff --git a/src/arches-v2/dual-streaming.hpp b/src/arches-v2/dual-streaming.hpp deleted file mode 100644 index a64e150..0000000 --- a/src/arches-v2/dual-streaming.hpp +++ /dev/null @@ -1,752 +0,0 @@ -#pragma once - -#include "shared-utils.hpp" -#include "units/dual-streaming/unit-treelet-rt-core.hpp" -#include "units/dual-streaming/unit-stream-scheduler.hpp" -#include "units/dual-streaming/unit-ray-staging-buffer.hpp" -#include "units/dual-streaming/unit-tp.hpp" -#include "units/dual-streaming/unit-hit-record-updater.hpp" -#include "units/dual-streaming/unit-scene-buffer.hpp" - -namespace Arches { - -namespace ISA { namespace RISCV { namespace DualStreaming { - -//see the opcode map for details -const static InstructionInfo isa_custom0_000_imm[8] = -{ - InstructionInfo(0x0, "fchthrd", InstrType::CUSTOM0, Encoding::U, RegFile::INT, MEM_REQ_DECL - { - MemoryRequest req; - req.type = MemoryRequest::Type::LOAD; - req.size = sizeof(uint32_t); - req.dst.push(DstReg(instr.rd, RegType::UINT32).u9, 9); - req.vaddr = 0x0ull; - - return req; - }), - InstructionInfo(0x1, "boxisect", InstrType::CUSTOM1, Encoding::U, RegFile::FLOAT, EXEC_DECL - { - Register32 * fr = unit->float_regs->registers; - - rtm::Ray ray; - rtm::vec3 inv_d; - ray.o.x = fr[0].f32; - ray.o.y = fr[1].f32; - ray.o.z = fr[2].f32; - ray.t_min = fr[3].f32; - inv_d.x = fr[4].f32; - inv_d.y = fr[5].f32; - inv_d.z = fr[6].f32; - ray.t_max = fr[7].f32; - - rtm::AABB aabb; - aabb.min.x = fr[8].f32; - aabb.min.y = fr[9].f32; - aabb.min.z = fr[10].f32; - aabb.max.x = fr[11].f32; - aabb.max.y = fr[12].f32; - aabb.max.z = fr[13].f32; - - unit->float_regs->registers[instr.u.rd].f32 = rtm::intersect(aabb, ray, inv_d); - }), - InstructionInfo(0x2, "triisect", InstrType::CUSTOM2, Encoding::U, RegFile::FLOAT, EXEC_DECL - { - Register32 * fr = unit->float_regs->registers; - - rtm::Ray ray; - ray.o.x = fr[0].f32; - ray.o.y = fr[1].f32; - ray.o.z = fr[2].f32; - ray.t_min = fr[3].f32; - ray.d.x = fr[4].f32; - ray.d.y = fr[5].f32; - ray.d.z = fr[6].f32; - ray.t_max = fr[7].f32; - - rtm::Triangle tri; - tri.vrts[0].x = fr[8].f32; - tri.vrts[0].y = fr[9].f32; - tri.vrts[0].z = fr[10].f32; - tri.vrts[1].x = fr[11].f32; - tri.vrts[1].y = fr[12].f32; - tri.vrts[1].z = fr[13].f32; - tri.vrts[2].x = fr[14].f32; - tri.vrts[2].y = fr[15].f32; - tri.vrts[2].z = fr[16].f32; - - rtm::Hit hit; - hit.t = fr[17].f32; - hit.bc[0] = fr[18].f32; - hit.bc[1] = fr[19].f32; - hit.id = fr[20].u32; - - rtm::intersect(tri, ray, hit); - - fr[17].f32 = hit.t; - fr[18].f32 = hit.bc[0]; - fr[19].f32 = hit.bc[1]; - fr[20].u32 = hit.id; - }), -}; - -const static InstructionInfo isa_custom0_funct3[8] = -{ - InstructionInfo(0x0, META_DECL{return isa_custom0_000_imm[instr.u.imm_31_12 >> 3]; }), - InstructionInfo(0x1, "lwi", InstrType::CUSTOM3, Encoding::I, RegFile::FLOAT, RegFile::INT, MEM_REQ_DECL - { - //load bucket ray into registers [rd - (rd + N)] - MemoryRequest mem_req; - mem_req.type = MemoryRequest::Type::LOAD; - mem_req.size = sizeof(WorkItem); - mem_req.dst.push(DstReg(instr.rd, RegType::FLOAT32).u9, 9); - mem_req.vaddr = unit->int_regs->registers[instr.i.rs1].u64 + i_imm(instr); - - return mem_req; - }), - InstructionInfo(0x2, "swi", InstrType::CUSTOM4, Encoding::S, RegFile::FLOAT, RegFile::INT, MEM_REQ_DECL - { - //store bucket ray to hit record updater - MemoryRequest mem_req; - mem_req.type = MemoryRequest::Type::STORE; - mem_req.size = sizeof(WorkItem); - mem_req.vaddr = unit->int_regs->registers[instr.s.rs1].u64 + s_imm(instr); - - Register32* fr = unit->float_regs->registers; - for(uint i = 0; i < sizeof(WorkItem) / sizeof(float); ++i) - ((float*)mem_req.data)[i] = fr[instr.s.rs2 + i].f32; - - return mem_req; - }), - InstructionInfo(0x3, "cshit", InstrType::CUSTOM5, Encoding::S, RegFile::FLOAT, RegFile::INT, MEM_REQ_DECL - { - MemoryRequest mem_req; - mem_req.type = MemoryRequest::Type::STORE; - mem_req.size = sizeof(rtm::Hit); - mem_req.vaddr = unit->int_regs->registers[instr.s.rs1].u64 + s_imm(instr); - - Register32* fr = unit->float_regs->registers; - for(uint i = 0; i < sizeof(rtm::Hit) / sizeof(float); ++i) - ((float*)mem_req.data)[i] = fr[instr.s.rs2 + i].f32; - - return mem_req; - }), - InstructionInfo(0x4, "lhit", InstrType::CUSTOM6, Encoding::I, RegFile::FLOAT, RegFile::INT, MEM_REQ_DECL - { - //load hit record into registers [rd - (rd + N)] - MemoryRequest mem_req; - mem_req.type = MemoryRequest::Type::LOAD; - mem_req.size = sizeof(rtm::Hit); - mem_req.dst.push(DstReg(instr.rd, RegType::FLOAT32).u9, 9); - mem_req.vaddr = unit->int_regs->registers[instr.i.rs1].u64 + i_imm(instr); - - return mem_req; - }), - InstructionInfo(0x5, "traceray", InstrType::CUSTOM7, Encoding::I, RegFile::FLOAT, MEM_REQ_DECL - { - //load hit record into registers [rd - (rd + N)] - MemoryRequest mem_req; - mem_req.type = MemoryRequest::Type::LOAD; - mem_req.size = sizeof(rtm::Hit); - mem_req.dst.push(DstReg(instr.rd, RegType::FLOAT32).u9, 9); - mem_req.vaddr = unit->int_regs->registers[instr.i.rs1].u64 + i_imm(instr); - - return mem_req; - }), -}; - -const static InstructionInfo custom0(CUSTOM_OPCODE0, META_DECL{return isa_custom0_funct3[instr.i.funct3];}); - -}}} - -namespace DualStreaming { - -class UnitL1Cache : public Units::UnitCache -{ -private: - std::pair _treelet_range; - -public: - UnitL1Cache(const UnitCache::Configuration& config, const std::pair& treelet_range) : - UnitCache(config), _treelet_range(treelet_range) - { - } - -private: - UnitMemoryBase* _get_mem_higher(paddr_t addr) override - { - if(addr >= _treelet_range.first && addr < _treelet_range.second) - return _mem_highers[1]; - - return _mem_highers[0]; - } -}; - -#include "dual-streaming-kernel/include.hpp" -#include "dual-streaming-kernel/intersect.hpp" - -typedef Units::UnitCache UnitL2Cache; -typedef Units::UnitDRAMRamulator UnitDRAM; -typedef rtm::CompressedWideTreeletBVH::Treelet SceneSegment; - -static DualStreamingKernelArgs initilize_buffers(Units::UnitMainMemoryBase* main_memory, paddr_t& heap_address, const SimulationConfig& sim_config, uint page_size) -{ - std::string scene_name = sim_config.get_string("scene_name"); - std::string project_folder = get_project_folder_path(); - std::string scene_file = project_folder + "datasets\\" + scene_name + ".obj"; - std::string bvh_cache_filename = project_folder + "datasets\\cache\\" + scene_name + ".bvh"; - - DualStreamingKernelArgs args; - args.framebuffer_width = sim_config.get_int("framebuffer_width"); - args.framebuffer_height = sim_config.get_int("framebuffer_height"); - args.framebuffer_size = args.framebuffer_width * args.framebuffer_height; - heap_address = align_to(page_size, heap_address); - args.framebuffer = reinterpret_cast(heap_address); - heap_address += args.framebuffer_size * sizeof(uint32_t); - - std::vector hits(args.framebuffer_size, {T_MAX, rtm::vec2(0.0), ~0u}); - args.hit_records = write_vector(main_memory, page_size, hits, heap_address); - - args.light_dir = rtm::normalize(rtm::vec3(4.5f, 42.5f, 5.0f)); - - args.camera = sim_config.camera; - - args.pregen_rays = sim_config.get_int("pregen_rays"); - args.use_early = sim_config.get_int("use_early"); - args.hit_delay = sim_config.get_int("hit_delay"); - - rtm::Mesh mesh(scene_file); - std::vector build_objects; - mesh.get_build_objects(build_objects); - - rtm::BVH bvh2(bvh_cache_filename, build_objects); - mesh.reorder(build_objects); - - std::vector rays(args.framebuffer_size); - if(args.pregen_rays) - pregen_rays(args.framebuffer_width, args.framebuffer_height, args.camera, bvh2, mesh, sim_config.get_int("pregen_bounce"), rays); - args.rays = write_vector(main_memory, CACHE_BLOCK_SIZE, rays, heap_address); - -#if DS_USE_COMPRESSED_WIDE_BVH - //rtm::WBVH wbvh(bvh2, build_objects, &mesh, false); - //mesh.reorder(build_objects); - - //rtm::NVCWBVH cwbvh(wbvh); - - //rtm::CompressedWideTreeletBVH cwtbvh(cwbvh, wbvh.ft_blocks.data()); - //args.treelets = write_vector(main_memory, page_size, cwtbvh.treelets, heap_address); - //args.treelet_headers = write_vector(main_memory, page_size, cwtbvh.treelet_headers, heap_address); - //args.num_treelets = cwtbvh.treelets.size(); -#else - rtm::WBVH wbvh(bvh2, build_objects); - mesh.reorder(build_objects); - - rtm::WideTreeletBVH wtbvh(wbvh, mesh); - args.treelets = write_vector(main_memory, page_size, wtbvh.treelets, heap_address); - args.num_treelets = wtbvh.treelets.size(); -#endif - - std::vector tris; - mesh.get_triangles(tris); - args.tris = write_vector(main_memory, CACHE_BLOCK_SIZE, tris, heap_address); - - main_memory->direct_write(&args, sizeof(DualStreamingKernelArgs), DS_KERNEL_ARGS_ADDRESS); - return args; -} - -static void run_sim_dual_streaming(const SimulationConfig& sim_config) -{ - std::string project_folder = get_project_folder_path(); - -#if 1 //Modern config - double clock_rate = 2.0e9; - uint num_threads = 8; - uint num_tps = 64; - uint num_tms = 64; - uint64_t stack_size = 512; - - //Memory - uint num_partitions = 16; - uint partition_stride = 1 << 13; - - //DRAM - UnitDRAM::Configuration dram_config; - dram_config.config_path = project_folder + "build\\src\\arches-v2\\config-files\\gddr6_pch_config.yaml"; - dram_config.size = 4ull << 30; //4GB - dram_config.num_controllers = num_partitions; - dram_config.partition_stride = partition_stride; - - //L2$ - UnitL2Cache::Configuration l2_config; - l2_config.level = 2; - l2_config.size = 4 << 20; - l2_config.associativity = 16; - l2_config.num_ports = num_tms; - l2_config.num_slices = num_partitions; - l2_config.num_banks = 2; - l2_config.crossbar_width = 64; - l2_config.num_mshr = 192; - l2_config.latency = 170; - - UnitL2Cache::PowerConfig l2_power_config; - l2_power_config.leakage_power = 184.55e-3f * l2_config.num_banks; - l2_power_config.tag_energy = 0.00756563e-9f; - l2_power_config.read_energy = 0.378808e-9f - l2_power_config.tag_energy; - l2_power_config.write_energy = 0.365393e-9f - l2_power_config.tag_energy; - - //L1d$ - UnitL1Cache::Configuration l1d_config; - l1d_config.level = 1; - l1d_config.size = 64 << 10; - l1d_config.associativity = 32; - l1d_config.num_banks = 4; - l1d_config.crossbar_width = 4; - l1d_config.num_mshr = 256; - l1d_config.latency = 20; - - UnitL1Cache::PowerConfig l1d_power_config; - l1d_power_config.leakage_power = 7.19746e-3f * l1d_config.num_banks * num_tms; - l1d_power_config.tag_energy = 0.000663943e-9f; - l1d_power_config.read_energy = 0.0310981e-9f - l1d_power_config.tag_energy; - l1d_power_config.write_energy = 0.031744e-9f - l1d_power_config.tag_energy; - - //Scene buffer - Units::DualStreaming::UnitSceneBuffer::Configuration scene_buffer_config; - scene_buffer_config.size = 4 * 1024 * 1024; // 4MB - scene_buffer_config.latency = 4; - scene_buffer_config.num_banks = 32; - scene_buffer_config.bank_select_mask = generate_nbit_mask(log2i(scene_buffer_config.num_banks)) << log2i(CACHE_BLOCK_SIZE); - - Units::DualStreaming::UnitSceneBuffer::PowerConfig scene_buffer_power_config; - scene_buffer_power_config.leakage_power = 53.7192e-3f * scene_buffer_config.num_banks; - scene_buffer_power_config.read_energy = 0.118977e-9f; - scene_buffer_power_config.write_energy = 0.118977e-9f; -#else //Legacy config - -#endif - - ISA::RISCV::InstructionTypeNameDatabase::get_instance()[ISA::RISCV::InstrType::CUSTOM0] = "FCHTHRD"; - ISA::RISCV::InstructionTypeNameDatabase::get_instance()[ISA::RISCV::InstrType::CUSTOM1] = "BOXISECT"; - ISA::RISCV::InstructionTypeNameDatabase::get_instance()[ISA::RISCV::InstrType::CUSTOM2] = "TRIISECT"; - ISA::RISCV::InstructionTypeNameDatabase::get_instance()[ISA::RISCV::InstrType::CUSTOM3] = "LWI"; - ISA::RISCV::InstructionTypeNameDatabase::get_instance()[ISA::RISCV::InstrType::CUSTOM4] = "SWI"; - ISA::RISCV::InstructionTypeNameDatabase::get_instance()[ISA::RISCV::InstrType::CUSTOM5] = "CSHIT"; - ISA::RISCV::InstructionTypeNameDatabase::get_instance()[ISA::RISCV::InstrType::CUSTOM6] = "LHIT"; - ISA::RISCV::isa[ISA::RISCV::CUSTOM_OPCODE0] = ISA::RISCV::DualStreaming::custom0; - - uint num_sfus = static_cast(ISA::RISCV::InstrType::NUM_TYPES) * num_tms; - - Simulator simulator; - std::vector tps; - std::vector sfus; - std::vector thread_schedulers; - std::vector*> rtcs; - std::vector rsbs; - std::vector l1ds; - std::vector> unit_tables; unit_tables.reserve(num_tms); - std::vector> sfu_lists; sfu_lists.reserve(num_tms); - std::vector> mem_lists; mem_lists.reserve(num_tms); - - uint dram_ports_per_controller = 4; - dram_config.num_ports = dram_ports_per_controller * dram_config.num_controllers; - UnitDRAM dram(dram_config); - - simulator.register_unit(&dram); - simulator.new_unit_group(); - - Units::UnitBuffer::Configuration sram_config; - sram_config.latency = 1; - sram_config.size = 1 << 30; - sram_config.num_banks = num_partitions; - sram_config.num_ports = dram_ports_per_controller * num_partitions; - - Units::UnitBuffer sram(sram_config); - simulator.register_unit(&sram); - simulator.new_unit_group(); - - ELF elf(project_folder + "src\\dual-streaming-kernel\\riscv\\kernel"); - - dram.clear(); - paddr_t heap_address = elf.load(dram._data_u8); - DualStreamingKernelArgs kernel_args = DualStreaming::initilize_buffers(&dram, heap_address, sim_config, partition_stride); - heap_address = align_to(partition_stride * num_partitions, heap_address); - std::pair treelet_range = {(paddr_t)kernel_args.treelets, (paddr_t)kernel_args.treelets + kernel_args.num_treelets * sizeof(SceneSegment)}; - - std::set unused_dram_ports; - for(uint i = 0; i < dram_ports_per_controller; ++i) - unused_dram_ports.insert(i); - - l2_config.num_ports = num_tms; - l2_config.mem_highers = {&dram}; - l2_config.mem_higher_port = *unused_dram_ports.begin(); - l2_config.mem_higher_port_stride = dram_ports_per_controller; - unused_dram_ports.erase(*unused_dram_ports.begin()); - - UnitL2Cache l2(l2_config); - simulator.register_unit(&l2); - - Units::UnitAtomicRegfile atomic_regs(num_tms); - simulator.register_unit(&atomic_regs); - simulator.new_unit_group(); - - scene_buffer_config.segment_start = (paddr_t)kernel_args.treelets; - scene_buffer_config.segment_size = sizeof(SceneSegment); - scene_buffer_config.num_ports = num_tms; - scene_buffer_config.row_size = partition_stride; - scene_buffer_config.block_size = CACHE_BLOCK_SIZE; - scene_buffer_config.num_channels = num_partitions; - scene_buffer_config.main_mem = &dram; - scene_buffer_config.main_mem_port_offset = *unused_dram_ports.begin(); - scene_buffer_config.main_mem_port_stride = dram_ports_per_controller; - unused_dram_ports.erase(*unused_dram_ports.begin()); - - Units::DualStreaming::UnitSceneBuffer scene_buffer(scene_buffer_config); - simulator.register_unit(&scene_buffer); - - Units::DualStreaming::UnitStreamScheduler::Configuration stream_scheduler_config; - stream_scheduler_config.num_banks = 32; - stream_scheduler_config.num_channels = num_partitions; - stream_scheduler_config.traversal_scheme = sim_config.get_int("traversal_scheme"); - stream_scheduler_config.weight_scheme = sim_config.get_int("weight_scheme"); - stream_scheduler_config.num_tms = num_tms; - stream_scheduler_config.block_size = CACHE_BLOCK_SIZE; - stream_scheduler_config.row_size = partition_stride; - stream_scheduler_config.num_root_rays = kernel_args.framebuffer_size; - stream_scheduler_config.treelet_addr = *(paddr_t*)&kernel_args.treelets; - stream_scheduler_config.heap_addr = *(paddr_t*)&heap_address; - stream_scheduler_config.cheat_treelets = (rtm::WideTreeletBVH::Treelet::Header*)&dram._data_u8[(size_t)kernel_args.treelet_headers]; - stream_scheduler_config.main_mem = &dram; - stream_scheduler_config.main_mem_port_stride = dram_ports_per_controller; - stream_scheduler_config.main_mem_port_offset = *unused_dram_ports.begin(); - unused_dram_ports.erase(*unused_dram_ports.begin()); - - if(sim_config.get_int("use_scene_buffer")) - { - stream_scheduler_config.max_active_segments = scene_buffer_config.size / sizeof(SceneSegment); - stream_scheduler_config.scene_buffer = &scene_buffer; - } - else - { - stream_scheduler_config.l2_cache_port = l2_config.num_ports - 1; - stream_scheduler_config.l2_cache = nullptr; // &l2; - stream_scheduler_config.max_active_segments = num_tms * 2; - } - if(sim_config.get_int("rays_on_chip")) - { - stream_scheduler_config.main_mem = &sram; - } - - Units::DualStreaming::UnitStreamScheduler stream_scheduler(stream_scheduler_config); - simulator.register_unit(&stream_scheduler); - - Units::DualStreaming::UnitHitRecordUpdater::Configuration hit_record_updater_config; - hit_record_updater_config.num_tms = num_tms; - hit_record_updater_config.hit_record_start = *(paddr_t*)&kernel_args.hit_records; - hit_record_updater_config.cache_size = sim_config.get_int("hit_buffer_size"); // 128 * 16 = 2048B = 2KB - hit_record_updater_config.associativity = 8; - hit_record_updater_config.row_size = partition_stride; - hit_record_updater_config.num_channels = num_partitions; - hit_record_updater_config.main_mem = &dram; - hit_record_updater_config.main_mem_port_stride = dram_ports_per_controller; - hit_record_updater_config.main_mem_port_offset = *unused_dram_ports.begin(); - unused_dram_ports.erase(*unused_dram_ports.begin()); - - if(sim_config.get_int("hits_on_chip")) - { - std::vector hits(kernel_args.framebuffer_size); - for(auto& hit : hits) hit.t = T_MAX; - paddr_t address = *(paddr_t*)&kernel_args.hit_records; - write_vector(&sram, partition_stride, hits, address); - hit_record_updater_config.main_mem = &sram; - } - - Units::DualStreaming::UnitHitRecordUpdater hit_record_updater(hit_record_updater_config); - simulator.register_unit(&hit_record_updater); - simulator.new_unit_group(); - - l1d_config.num_ports = num_tps; -#ifdef DS_USE_RT_CORE - l1d_config.num_ports += 1 * l1d_config.num_ports / l1d_config.crossbar_width; //add extra port for RT core - l1d_config.crossbar_width += 1; -#endif - l1d_config.mem_highers = {&l2, &scene_buffer}; - for(uint tm_index = 0; tm_index < num_tms; ++tm_index) - { - std::vector unit_table((uint)ISA::RISCV::InstrType::NUM_TYPES, nullptr); - std::vector mem_list; - std::vector sfu_list; - - l1d_config.mem_higher_port = tm_index; - l1ds.push_back(_new UnitL1Cache(l1d_config, sim_config.get_int("use_scene_buffer") ? treelet_range : std::pair(0ull, 0ull))); - simulator.register_unit(l1ds.back()); - mem_list.push_back(l1ds.back()); - unit_table[(uint)ISA::RISCV::InstrType::LOAD] = l1ds.back(); - unit_table[(uint)ISA::RISCV::InstrType::STORE] = l1ds.back(); - - #if DS_USE_RT_CORE - rsbs.push_back(_new Units::DualStreaming::UnitRayStagingBuffer(1, tm_index, &stream_scheduler, &hit_record_updater)); - simulator.register_unit(rsbs.back()); - - Units::DualStreaming::UnitTreeletRTCore::Configuration rtc_config; - rtc_config.max_rays = 256; - rtc_config.num_tp = num_tps; - rtc_config.treelet_base_addr = (paddr_t)kernel_args.treelets; - rtc_config.hit_record_base_addr = (paddr_t)kernel_args.hit_records; - rtc_config.use_early_termination = sim_config.get_int("use_early"); - rtc_config.cache = l1ds.back(); - rtc_config.rsb = rsbs.back(); - - rtcs.push_back(_new Units::DualStreaming::UnitTreeletRTCore(rtc_config)); - simulator.register_unit(rtcs.back()); - mem_list.push_back(rtcs.back()); - unit_table[(uint)ISA::RISCV::InstrType::CUSTOM4] = rtcs.back(); //SWI - unit_table[(uint)ISA::RISCV::InstrType::CUSTOM6] = rtcs.back(); //LHIT - #else - rsbs.push_back(_new Units::DualStreaming::UnitRayStagingBuffer(num_tps, tm_index, &stream_scheduler, &hit_record_updater)); - simulator.register_unit(rsbs.back()); - mem_list.push_back(rsbs.back()); - unit_table[(uint)ISA::RISCV::InstrType::CUSTOM3] = rsbs.back(); //LWI - unit_table[(uint)ISA::RISCV::InstrType::CUSTOM4] = rsbs.back(); //SWI - unit_table[(uint)ISA::RISCV::InstrType::CUSTOM5] = rsbs.back(); //CSHIT - unit_table[(uint)ISA::RISCV::InstrType::CUSTOM6] = rsbs.back(); //LHIT - #endif - - thread_schedulers.push_back(_new Units::UnitThreadScheduler(num_tps, tm_index, &atomic_regs, 64)); - simulator.register_unit(thread_schedulers.back()); - mem_list.push_back(thread_schedulers.back()); - unit_table[(uint)ISA::RISCV::InstrType::ATOMIC] = thread_schedulers.back(); - unit_table[(uint)ISA::RISCV::InstrType::CUSTOM0] = thread_schedulers.back(); - - - //sfu_list.push_back(_new Units::UnitSFU(num_tps, 2, 1, num_tps)); - //simulator.register_unit(sfu_list.back()); - //unit_table[(uint)ISA::RISCV::InstrType::FADD] = sfu_list.back(); - //unit_table[(uint)ISA::RISCV::InstrType::FMUL] = sfu_list.back(); - //unit_table[(uint)ISA::RISCV::InstrType::FFMAD] = sfu_list.back(); - - sfu_list.push_back(_new Units::UnitSFU(num_tps / 8, 1, 1, num_tps)); - simulator.register_unit(sfu_list.back()); - unit_table[(uint)ISA::RISCV::InstrType::IMUL] = sfu_list.back(); - unit_table[(uint)ISA::RISCV::InstrType::IDIV] = sfu_list.back(); - - sfu_list.push_back(_new Units::UnitSFU(num_tps / 16, 6, 1, num_tps)); - simulator.register_unit(sfu_list.back()); - unit_table[(uint)ISA::RISCV::InstrType::FDIV] = sfu_list.back(); - unit_table[(uint)ISA::RISCV::InstrType::FSQRT] = sfu_list.back(); - - #if DS_USE_HARDWARE_INTERSECTORS - sfu_list.push_back(_new Units::UnitSFU(2, 3, 1, num_tps_per_tm)); - simulator.register_unit(sfu_list.back()); - unit_table[(uint)ISA::RISCV::InstrType::CUSTOM1] = sfu_list.back(); - - sfu_list.push_back(_new Units::UnitSFU(1, 22, 8, num_tps_per_tm)); - simulator.register_unit(sfu_list.back()); - unit_table[(uint)ISA::RISCV::InstrType::CUSTOM2] = sfu_list.back(); - #endif - - for(auto& sfu : sfu_list) - sfus.push_back(sfu); - - unit_tables.emplace_back(unit_table); - sfu_lists.emplace_back(sfu_list); - mem_lists.emplace_back(mem_list); - - for(uint tp_index = 0; tp_index < num_tps; ++tp_index) - { - Units::UnitTP::Configuration tp_config; - tp_config.tp_index = tp_index; - tp_config.tm_index = tm_index; - tp_config.stack_size = stack_size; - tp_config.cheat_memory = dram._data_u8; - tp_config.unit_table = &unit_tables.back(); - tp_config.unique_mems = &mem_lists.back(); - tp_config.unique_sfus = &sfu_lists.back(); - tp_config.num_threads = num_threads; - - tps.push_back(new Units::DualStreaming::UnitTP(tp_config)); - simulator.register_unit(tps.back()); - } - - simulator.new_unit_group(); - } - - printf("Starting Dual-Streaming\n"); - for(auto& tp : tps) - tp->set_entry_point(elf.elf_header->e_entry.u64); - - //master logs - Units::UnitBuffer::Log sram_log; - UnitDRAM::Log dram_log; - UnitL2Cache::Log l2_log; - UnitL1Cache::Log l1d_log; - Units::UnitTP::Log tp_log; - - Units::DualStreaming::UnitTreeletRTCore::Log rtc_log; - Units::DualStreaming::UnitSceneBuffer::Log sb_log; - Units::DualStreaming::UnitStreamScheduler::Log ss_log; - - uint delta = sim_config.get_int("logging_interval"); - auto start = std::chrono::high_resolution_clock::now(); - simulator.execute(delta, [&]() -> void - { - float delta_us = delta / (clock_rate / 1'000'000); - - Units::UnitBuffer::Log sram_delta_log = delta_log(sram_log, sram); - UnitDRAM::Log dram_delta_log = delta_log(dram_log, dram); - UnitL2Cache::Log l2_delta_log = delta_log(l2_log, l2); - UnitL1Cache::Log l1d_delta_log = delta_log(l1d_log, l1ds); - - Units::DualStreaming::UnitTreeletRTCore::Log rtc_delta_log = delta_log(rtc_log, rtcs); - Units::DualStreaming::UnitSceneBuffer::Log sb_delta_log = delta_log(sb_log, scene_buffer); - Units::DualStreaming::UnitStreamScheduler::Log ss_delta_log = delta_log(ss_log, stream_scheduler); - - printf(" \n"); - printf("Cycle: %lld \n", simulator.current_cycle); - printf("Threads Launched: %d \n", atomic_regs.iregs[0]); - printf("Buckets Launched: %lld \n", ss_log.buckets_launched); - printf("Segments Launched: %lld \n", ss_log.segments_launched); - printf("Prefetch queue size: %lld \n", stream_scheduler._l2_cache_prefetch_queue.size()); - printf(" \n"); - printf(" Ray Total: %8.1f bytes/cycle\n", (float)(ss_delta_log.buckets_generated + ss_delta_log.buckets_launched) * RAY_BUCKET_SIZE / delta); - printf(" Ray Write: %8.1f bytes/cycle\n", (float)ss_delta_log.buckets_generated * RAY_BUCKET_SIZE / delta); - printf(" Ray Read: %8.1f bytes/cycle\n", (float)ss_delta_log.buckets_launched * RAY_BUCKET_SIZE / delta); - printf(" Ray Write: %8.1f Mrays/s \n", (float)(ss_delta_log.buckets_generated) * Arches::Units::DualStreaming::RayBucket::MAX_RAYS / delta_us); - printf(" Ray Read: %8.1f Mrays/s \n", (float)(ss_delta_log.buckets_launched) * Arches::Units::DualStreaming::RayBucket::MAX_RAYS / delta_us); - printf(" \n"); - printf("Scene Fill: %8.1f bytes/cycle\n", (float)sb_delta_log.bytes_written / delta); - printf("Scene Read: %8.1f bytes/cycle\n", (float)sb_delta_log.bytes_read / delta); - printf(" \n"); - printf("DRAM Total: %8.1f bytes/cycle\n", (float)(dram_delta_log.bytes_read + dram_delta_log.bytes_written) / delta); - printf("DRAM Write: %8.1f bytes/cycle\n", (float)dram_delta_log.bytes_written / delta); - printf("DRAM Read: %8.1f bytes/cycle\n", (float)dram_delta_log.bytes_read / delta); - printf(" \n"); - printf("SRAM Total: %8.1f bytes/cycle\n", (float)(sram_delta_log.bytes_read + sram_delta_log.bytes_written) / delta); - printf("SRAM Write: %8.1f bytes/cycle\n", (float)sram_delta_log.bytes_written / delta); - printf("SRAM Read: %8.1f bytes/cycle\n", (float)sram_delta_log.bytes_read / delta); - printf(" \n"); - printf(" L2$ Read: %8.1f bytes/cycle\n", (float)l2_delta_log.bytes_read / delta); - printf(" L1d$ Read: %8.1f bytes/cycle\n", (float)l1d_delta_log.bytes_read / delta); - printf(" \n"); - printf(" L2$ Hit Rate: %8.1f%%\n", 100.0 * l2_delta_log.hits / l2_delta_log.get_total()); - printf("L1d$ Hit Rate: %8.1f%%\n", 100.0 * l1d_delta_log.hits / l1d_delta_log.get_total()); - printf(" \n"); - }); - auto stop = std::chrono::high_resolution_clock::now(); - - cycles_t frame_cycles = simulator.current_cycle; - double frame_time = frame_cycles / clock_rate; - double simulation_time = std::chrono::duration_cast(stop - start).count() / 1000.0; - float total_power = dram.total_power(); - - tp_log.print_profile(dram._data_u8); - - dram.print_stats(4, frame_cycles); - print_header("DRAM"); - delta_log(dram_log, dram); - dram_log.print(frame_cycles); - - print_header("SRAM"); - delta_log(sram_log, sram); - sram_log.print(frame_cycles); - - print_header("Stream Scheduler"); - delta_log(ss_log, stream_scheduler); - ss_log.print(); - - print_header("Scene Buffer"); - delta_log(sb_log, scene_buffer); - sb_log.print(frame_cycles); - total_power += sb_log.print_power(scene_buffer_power_config, frame_time); - - print_header("L2$"); - delta_log(l2_log, l2); - l2_log.print(frame_cycles); - total_power += l2_log.print_power(l2_power_config, frame_time); - - print_header("L1d$"); - delta_log(l1d_log, l1ds); - l1d_log.print(frame_cycles); - total_power += l1d_log.print_power(l1d_power_config, frame_time); - - print_header("TP"); - delta_log(tp_log, tps); - tp_log.print(tps.size()); - - if(!rtcs.empty()) - { - print_header("RT Core"); - delta_log(rtc_log, rtcs); - rtc_log.print(frame_cycles, rtcs.size()); - } - - float total_energy = total_power * frame_time; - - print_header("Performance Summary"); - printf("Cycles: %lld\n", frame_cycles); - printf("Clock rate: %.0f MHz\n", clock_rate / 1'000'000.0); - printf("Frame time: %.3g ms\n", frame_time * 1000.0); - printf("Mrays/s: %.0f\n", (float)ss_log.rays / frame_time / 1'000'000); - - print_header("Power Summary"); - printf("Energy: %.2f mJ\n", total_energy * 1000.0); - printf("Power: %.2f W\n", total_power); - printf("Mrays/J: %.2f\n", (float)ss_log.rays / total_energy / 1'000'000); - - print_header("Simulation Summary"); - printf("Simulation rate: %.2f KHz\n", frame_cycles / simulation_time / 1000.0); - printf("Simulation time: %.0f s\n", simulation_time); - - print_header("Treelet Histogram"); - -#if 0 - uint treelet_counts[16]; - std::map treelet_histos[16]; - - for(uint i = 0; i < 16; ++i) - treelet_counts[i] = 0; - - for(auto& a : l1d_log.profile_counters) - { - if(a.first >= treelet_range.first && a.first < treelet_range.second) - { - uint treelet_id = (a.first - (paddr_t)kernel_args.treelets) / rtm::PackedTreelet::SIZE; - paddr_t treelet_addr = (paddr_t)kernel_args.treelets + treelet_id * rtm::PackedTreelet::SIZE; - rtm::PackedTreelet::Header header; - dram.direct_read(&header, sizeof(rtm::PackedTreelet::Header), treelet_addr); - uint offset = a.first - treelet_addr; - - treelet_histos[header.depth][offset] += a.second; - - if(offset == 64) - treelet_counts[header.depth]++; - } - } - - for(uint i = 0; i < 16; ++i) - { - if(treelet_counts[i]) - { - printf("Depth %d (%d)\n", i, treelet_counts[i]); - uint64_t total = 0; - total += treelet_histos[i][64]; - for(auto& a : treelet_histos[i]) - { - printf("\t%6d:%.2f(%.2f%%)\n", a.first / 64, (double)a.second / treelet_counts[i], 100.0 * a.second / total); - } - printf("\n"); - } - } -#endif - stbi_flip_vertically_on_write(true); - dram.dump_as_png_uint8((paddr_t)kernel_args.framebuffer, kernel_args.framebuffer_width, kernel_args.framebuffer_height, "out.png"); - - for(auto& tp : tps) delete tp; - for(auto& sfu : sfus) delete sfu; - for(auto& l1d : l1ds) delete l1d; - for(auto& thread_scheduler : thread_schedulers) delete thread_scheduler; - for(auto& rtc : rtcs) delete rtc; - for(auto& rsb : rsbs) delete rsb; -} - -} -} diff --git a/src/arches-v2/isa/riscv.cpp b/src/arches-v2/isa/riscv.cpp index a9ae7fa..82e00fb 100644 --- a/src/arches-v2/isa/riscv.cpp +++ b/src/arches-v2/isa/riscv.cpp @@ -1,6 +1,10 @@ #include "riscv.hpp" -#include +#if defined BUILD_PLATFORM_WINDOWS + #include +#elif defined BUILD_PLATFORM_LINUX + #include +#endif #include "errors.hpp" #include "util/bit-manipulation.hpp" @@ -203,7 +207,7 @@ InstructionInfo const isa_SYSTEM[2] = InstructionInfo(0b000000000001, "ebreak", InstrType::SYS, Encoding::I, RegFile::INT, EXEC_DECL { //break point - __debugbreak(); + add_breakpoint(); }), }; @@ -745,7 +749,7 @@ InstructionInfo const isa_OP_FSGNJ_FP[3] = //r.funct3 { float rs1 = unit->float_regs->registers[instr.r.rs1].f32; float rs2 = unit->float_regs->registers[instr.r.rs2].f32; - unit->float_regs->registers[instr.r.rd].f32 = copysignf(rs1, (signbit(rs1) != signbit(rs2)) ? -1.0f : 1.0f); + unit->float_regs->registers[instr.r.rd].f32 = copysignf(rs1, (std::signbit(rs1) != std::signbit(rs2)) ? -1.0f : 1.0f); }), }; diff --git a/src/arches-v2/main.cpp b/src/arches-v2/main.cpp index a71fc26..93f5435 100644 --- a/src/arches-v2/main.cpp +++ b/src/arches-v2/main.cpp @@ -5,6 +5,7 @@ #include "units/trax/unit-rt-core.hpp" #include "trax-kernel/include.hpp" #include "trax-kernel/intersect.hpp" +#include "units/unit-texture.hpp" namespace Arches { @@ -108,6 +109,26 @@ const static InstructionInfo isa_custom0_funct3[8] = for(uint i = 0; i < sizeof(rtm::Ray) / sizeof(float); ++i) ((float*)mem_req.data)[i] = fr[instr.i.rs1 + i].f32; + return mem_req; + }), + InstructionInfo(0x6, "sample2d", InstrType::CUSTOM3, Encoding::I, RegFile::FLOAT, MEM_REQ_DECL + { + Register32* fr = unit->float_regs->registers; + + MemoryRequest mem_req; + mem_req.type = MemoryRequest::Type::STORE; + mem_req.size = 8; + mem_req.dst.push(DstReg(instr.rd, RegType::FLOAT32).u9, 9); + mem_req.vaddr = fr[instr.i.rs1].u32; + + ((float*)mem_req.data)[0] = fr[instr.i.rs1 + 1].f32; + ((float*)mem_req.data)[1] = fr[instr.i.rs1 + 2].f32; + + //fr[instr.i.rd + 0].f32 = fr[instr.i.rs1 + 1].f32; + //fr[instr.i.rd + 1].f32 = fr[instr.i.rs1 + 2].f32; + //fr[instr.i.rd + 2].f32 = 0.5f; + //fr[instr.i.rd + 3].f32 = 1.0f; + return mem_req; }), }; @@ -130,8 +151,8 @@ static TRaXKernelArgs initilize_buffers(Units::UnitMainMemoryBase** drams, const { std::string scene_name = sim_config.get_string("scene-name"); std::string project_folder = get_project_folder_path(); - std::string datasets_folder = project_folder + "datasets\\"; - std::string cache_folder = project_folder + "datasets\\cache\\"; + std::string datasets_folder = project_folder + "datasets/"; + std::string cache_folder = project_folder + "datasets/cache/"; TRaXKernelArgs args; args.framebuffer_width = sim_config.get_int("framebuffer-width"); @@ -162,6 +183,8 @@ static TRaXKernelArgs initilize_buffers(Units::UnitMainMemoryBase** drams, const args.rays = write_vector(drams, xbar, 256, rays, heap_address); } + args.materials = write_vector(drams, xbar, 256, mesh.materials, heap_address); + args.nodes = write_vector(drams, xbar, 256, bvh.nodes, heap_address); #if USE_HECWBVH_V1 @@ -182,16 +205,19 @@ static TRaXKernelArgs initilize_buffers(Units::UnitMainMemoryBase** drams, const for(uint32_t i = 0; i < mesh.materials.size(); ++i) { + if(!mesh.materials[i].use_am) + continue; Texture2D& tex = mesh.materials[i].albedo_texture; Texture2D::Texel* dev_tex = write_array(drams, xbar, 256, tex.texels, tex.width * tex.height, heap_address); free(tex.texels); tex.texels = dev_tex; } - args.materials = write_vector(drams, xbar, 256, mesh.materials, heap_address); + paddr_t mat_addr = (paddr_t)args.materials; + args.materials = write_vector(drams, xbar, 256, mesh.materials, mat_addr); for(uint32_t i = 0; i < mesh.materials.size(); ++i) - mesh.materials[i].albedo_texture.texels = nullptr; + mesh.materials[i].albedo_texture.texels = nullptr; // to not free device memory textures size_t temp = TRAX_KERNEL_ARGS_ADDRESS; write_array(drams, xbar, 256, (uint8_t*)&args, sizeof(TRaXKernelArgs), temp); @@ -349,7 +375,7 @@ static void run_sim_trax(SimulationConfig& sim_config) //DRAM UnitDRAM::Configuration dram_config; - dram_config.config_path = project_folder_path + "build\\src\\arches-v2\\config-files\\gddr6_14000_config.yaml"; + dram_config.config_path = project_folder_path + "build/src/arches-v2/config-files/gddr6_14000_config.yaml"; dram_config.size = 1ull << 30; //1GB dram_config.clock_ratio = dram_clock / core_clock; dram_config.latency = 92; @@ -388,6 +414,8 @@ static void run_sim_trax(SimulationConfig& sim_config) UnitL1Cache::PowerConfig l1d_power_config; + Units::UnitTexture::Configuration tu_config; + UnitRTCore::Configuration rtc_config; rtc_config.max_rays = 64; rtc_config.num_cache_ports = 4; @@ -446,11 +474,12 @@ static void run_sim_trax(SimulationConfig& sim_config) UnitL1Cache::PowerConfig l1d_power_config; #endif - ELF elf(project_folder_path + "src\\trax-kernel\\riscv\\kernel"); + ELF elf(project_folder_path + "src/trax-kernel/riscv/kernel"); ISA::RISCV::InstructionTypeNameDatabase::get_instance()[ISA::RISCV::InstrType::CUSTOM0] = "FCHTHRD"; ISA::RISCV::InstructionTypeNameDatabase::get_instance()[ISA::RISCV::InstrType::CUSTOM1] = "BOXISECT"; ISA::RISCV::InstructionTypeNameDatabase::get_instance()[ISA::RISCV::InstrType::CUSTOM2] = "TRIISECT"; + ISA::RISCV::InstructionTypeNameDatabase::get_instance()[ISA::RISCV::InstrType::CUSTOM3] = "SAMPLE2D"; ISA::RISCV::InstructionTypeNameDatabase::get_instance()[ISA::RISCV::InstrType::CUSTOM7] = "TRACERAY"; ISA::RISCV::isa[ISA::RISCV::CUSTOM_OPCODE0] = ISA::RISCV::TRaX::custom0; @@ -461,6 +490,7 @@ static void run_sim_trax(SimulationConfig& sim_config) std::vector sfus; std::vector thread_schedulers; std::vector rtcs; + std::vector tus; std::vector l1ds; std::vector> unit_tables; unit_tables.reserve(num_tms); std::vector> sfu_lists; sfu_lists.reserve(num_tms); @@ -500,6 +530,9 @@ static void run_sim_trax(SimulationConfig& sim_config) TRaXKernelArgs kernel_args = initilize_buffers((Units::UnitMainMemoryBase**)drams.data(), xbar, heap_address, sim_config, partition_stride); heap_address = align_to(partition_stride, heap_address); + for(uint addr = 0; addr < (256 << 20); addr += partition_stride) + drams[xbar.get_partition(addr)]->direct_read(vec_mem.data() + addr, partition_stride, xbar.strip_partition_bits(addr)); + //bool warm_l2 = false; //if(warm_l2) //{ @@ -576,6 +609,16 @@ static void run_sim_trax(SimulationConfig& sim_config) //l1ss.push_back(new Units::UnitStreamCache(l1s_config)); //simulator.register_unit(l1ss.back()); + tu_config.num_clients = num_tps; + tu_config.cache = l1ds.back(); + tu_config.cache_port = num_tps + 1; + tu_config.cheat_mem = vec_mem.data(); + + tus.push_back(_new Units::UnitTexture(tu_config)); + simulator.register_unit(tus.back()); + mem_list.push_back(tus.back()); + unit_table[(uint)ISA::RISCV::InstrType::CUSTOM3] = tus.back(); + #if TRAX_USE_RT_CORE rtc_config.num_clients = num_tps; rtc_config.node_base_addr = (paddr_t)kernel_args.nodes; @@ -754,6 +797,7 @@ static void run_sim_trax(SimulationConfig& sim_config) int main(int argc, char* argv[]) { + Arches::set_full_exe_name(argv[0]); Arches::SimulationConfig sim_config(argc, argv); Arches::TRaX::run_sim_trax(sim_config); return 0; diff --git a/src/arches-v2/shared-utils.hpp b/src/arches-v2/shared-utils.hpp index 8ae23df..72bdddd 100644 --- a/src/arches-v2/shared-utils.hpp +++ b/src/arches-v2/shared-utils.hpp @@ -17,15 +17,26 @@ #include "isa/riscv.hpp" #include "rtm/rtm.hpp" -#include - +#if defined BUILD_PLATFORM_WINDOWS + #include + #define MAX_FILENAME_LENGTH MAX_PATH +#elif defined BUILD_PLATFORM_LINUX + #include + #define MAX_FILENAME_LENGTH FILENAME_MAX +#endif + +char full_exe_name[MAX_FILENAME_LENGTH]; namespace Arches { +void set_full_exe_name(const char *name) { + strcpy(full_exe_name, name); +} + std::string get_project_folder_path() { - CHAR path[MAX_PATH]; - GetModuleFileNameA(NULL, path, MAX_PATH); - std::string executable_path(path); + // CHAR path[MAX_PATH]; + // GetModuleFileNameA(NULL, path, MAX_PATH); + std::string executable_path(full_exe_name); return executable_path.substr(0, executable_path.rfind("build")); } @@ -132,6 +143,8 @@ const static std::vector scene_configs = {"intel-sponza", rtm::vec3(-900.6f, 150.8f, 120.74f), rtm::vec3(79.7f, 14.0f, -17.4f), 12.0f}, //INTEL SPONZA + {"sponza", rtm::vec3(0.0f, 2.0f, 0.0f), rtm::vec3(90.0f, 0.0f, -1.0f), 12.0f}, // SPONZA + {"san-miguel", rtm::vec3(7.448, 1.014, 12.357), rtm::vec3(8.056, 1.04, 11.563), 12.0f}, //SAN_MIGUEL {"hairball", rtm::vec3(0, 0, 10), rtm::vec3(0, 0, 0), 24.0f}, //HAIRBALL @@ -206,10 +219,10 @@ class SimulationConfig set_param("max-rays", 128); //Workload - set_param("scene-name", "intel-sponza"); + set_param("scene-name", "sponza"); set_param("framebuffer-width", 1024); set_param("framebuffer-height", 1024); - set_param("pregen-rays", 1); + set_param("pregen-rays", 0); set_param("pregen-bounce", 1); set_param("bvh-preset", 0); set_param("bvh-merging", 0); diff --git a/src/arches-v2/stdafx.hpp b/src/arches-v2/stdafx.hpp index 85befcb..4e08865 100644 --- a/src/arches-v2/stdafx.hpp +++ b/src/arches-v2/stdafx.hpp @@ -1,8 +1,14 @@ #pragma once -//Determine debug/release +//Determine platform #if defined _WIN16 || defined WIN32 || defined _WIN32 || defined WIN64 || defined _WIN64 || defined __WIN32__ || defined __TOS_WIN__ || defined __WINDOWS__ #define BUILD_PLATFORM_WINDOWS +#elif defined __linux__ + #define BUILD_PLATFORM_LINUX +#endif + +//Determine debug/release +#ifdef BUILD_PLATFORM_WINDOWS #if defined _DEBUG || defined DEBUG #define BUILD_DEBUG @@ -108,6 +114,14 @@ #endif #define notimpl implerr +//To add breakpoints for debugging at runtime +#if defined BUILD_PLATFORM_WINDOWS + #define add_breakpoint() __debugbreak() +#elif defined BUILD_PLATFORM_LINUX + #include + #define add_breakpoint() raise(SIGINT) +#endif + //Language facts #if -2>>1 == -1 #define SHIFTRIGHT_ARITHMETIC //Pads with sign bit @@ -142,16 +156,17 @@ #include #include -#include -#include - - - +//Include platform intrinsics header +#if defined BUILD_PLATFORM_WINDOWS + #include +#elif defined BUILD_PLATFORM_LINUX + #include +#endif #ifndef _DEBUG inline void _assert(bool x) { - if(!x) __debugbreak(); + if(!x) add_breakpoint(); } #else inline void _assert(bool x) diff --git a/src/arches-v2/trax.hpp b/src/arches-v2/trax.hpp deleted file mode 100644 index ef695ec..0000000 --- a/src/arches-v2/trax.hpp +++ /dev/null @@ -1,730 +0,0 @@ -#pragma once - -#include "shared-utils.hpp" -#include "units/trax/unit-tp.hpp" -#include "units/trax/unit-rt-core.hpp" -#include "trax-kernel/include.hpp" -#include "trax-kernel/intersect.hpp" - -namespace Arches { - -namespace ISA { namespace RISCV { namespace TRaX { - -//see the opcode map for details -const static InstructionInfo isa_custom0_000_imm[8] = -{ - InstructionInfo(0x0, "fchthrd", InstrType::CUSTOM0, Encoding::U, RegFile::INT, MEM_REQ_DECL - { - MemoryRequest req; - req.type = MemoryRequest::Type::LOAD; - req.size = sizeof(uint32_t); - req.dst.push(DstReg(instr.rd, RegType::UINT32).u9, 9); - req.vaddr = 0x0ull; - return req; - }), - InstructionInfo(0x1, "boxisect", InstrType::CUSTOM1, Encoding::U, RegFile::FLOAT, EXEC_DECL - { - Register32 * fr = unit->float_regs->registers; - - rtm::Ray ray; - rtm::vec3 inv_d; - ray.o.x = fr[0].f32; - ray.o.y = fr[1].f32; - ray.o.z = fr[2].f32; - ray.t_min = fr[3].f32; - inv_d.x = fr[4].f32; - inv_d.y = fr[5].f32; - inv_d.z = fr[6].f32; - ray.t_max = fr[7].f32; - - rtm::AABB aabb; - aabb.min.x = fr[8].f32; - aabb.min.y = fr[9].f32; - aabb.min.z = fr[10].f32; - aabb.max.x = fr[11].f32; - aabb.max.y = fr[12].f32; - aabb.max.z = fr[13].f32; - - unit->float_regs->registers[instr.u.rd].f32 = rtm::intersect(aabb, ray, inv_d); - }), - InstructionInfo(0x2, "triisect", InstrType::CUSTOM2, Encoding::U, RegFile::FLOAT, EXEC_DECL - { - Register32 * fr = unit->float_regs->registers; - - rtm::Ray ray; - ray.o.x = fr[0].f32; - ray.o.y = fr[1].f32; - ray.o.z = fr[2].f32; - ray.t_min = fr[3].f32; - ray.d.x = fr[4].f32; - ray.d.y = fr[5].f32; - ray.d.z = fr[6].f32; - ray.t_max = fr[7].f32; - - rtm::Triangle tri; - tri.vrts[0].x = fr[8].f32; - tri.vrts[0].y = fr[9].f32; - tri.vrts[0].z = fr[10].f32; - tri.vrts[1].x = fr[11].f32; - tri.vrts[1].y = fr[12].f32; - tri.vrts[1].z = fr[13].f32; - tri.vrts[2].x = fr[14].f32; - tri.vrts[2].y = fr[15].f32; - tri.vrts[2].z = fr[16].f32; - - rtm::Hit hit; - hit.t = fr[17].f32; - hit.bc[0] = fr[18].f32; - hit.bc[1] = fr[19].f32; - hit.id = fr[20].u32; - - rtm::intersect(tri, ray, hit); - - fr[17].f32 = hit.t; - fr[18].f32 = hit.bc[0]; - fr[19].f32 = hit.bc[1]; - fr[20].u32 = hit.id; - }), -}; - -const static InstructionInfo isa_custom0_funct3[8] = -{ - InstructionInfo(0x0, META_DECL{return isa_custom0_000_imm[instr.u.imm_31_12 >> 3]; }), - InstructionInfo(0x1, IMPL_NONE), - InstructionInfo(0x2, IMPL_NONE), - InstructionInfo(0x3, IMPL_NONE), - InstructionInfo(0x4, IMPL_NONE), - InstructionInfo(0x5, "traceray", InstrType::CUSTOM7, Encoding::I, RegFile::FLOAT, MEM_REQ_DECL - { - MemoryRequest mem_req; - mem_req.type = MemoryRequest::Type::STORE; - mem_req.size = sizeof(rtm::Ray); - mem_req.dst.push(DstReg(instr.rd, RegType::FLOAT32).u9, 9); - mem_req.vaddr = 0xdeadbeefull; - - Register32* fr = unit->float_regs->registers; - for(uint i = 0; i < sizeof(rtm::Ray) / sizeof(float); ++i) - ((float*)mem_req.data)[i] = fr[instr.i.rs1 + i].f32; - - return mem_req; - }), -}; - -const static InstructionInfo custom0(CUSTOM_OPCODE0, META_DECL{return isa_custom0_funct3[instr.i.funct3];}); - -}}} - -namespace TRaX { - -typedef Units::UnitDRAMRamulator UnitDRAM; -typedef Units::UnitCache UnitL2Cache; -typedef Units::UnitCache UnitL1Cache; -typedef rtm::FTB PrimBlocks; -typedef Units::TRaX::UnitRTCore UnitRTCore; - -static TRaXKernelArgs initilize_buffers(uint8_t* main_memory, paddr_t& heap_address, const SimulationConfig& sim_config, uint page_size) -{ - std::string scene_name = sim_config.get_string("scene_name"); - std::string project_folder = get_project_folder_path(); - std::string datasets_folder = project_folder + "datasets\\"; - std::string cache_folder = project_folder + "datasets\\cache\\"; - - TRaXKernelArgs args; - args.framebuffer_width = sim_config.get_int("framebuffer_width"); - args.framebuffer_height = sim_config.get_int("framebuffer_height"); - args.framebuffer_size = args.framebuffer_width * args.framebuffer_height; - heap_address = align_to(page_size, heap_address); - args.framebuffer = reinterpret_cast(heap_address); - heap_address += args.framebuffer_size * sizeof(uint32_t); - - args.pregen_rays = sim_config.get_int("pregen_rays"); - uint pregen_bounce = sim_config.get_int("pregen_bounce"); - - args.light_dir = rtm::normalize(rtm::vec3(4.5f, 42.5f, 5.0f)); - args.camera = sim_config.camera; - - rtm::Mesh mesh(datasets_folder + scene_name + ".obj"); - rtm::CWBVH bvh(mesh, (cache_folder + scene_name + ".bvh").c_str()); - - std::vector rays(args.framebuffer_size); - if(args.pregen_rays) - { - std::string ray_file = scene_name + "-" + std::to_string(args.framebuffer_width) + "-" + std::to_string(pregen_bounce) + ".rays"; - #if USE_HECWBVH - pregen_rays(&bvh.nodes[0], &bvh.nodes[0].ftb, mesh, args.framebuffer_width, args.framebuffer_height, args.camera, pregen_bounce, rays); - #else - pregen_rays(&bvh.nodes[0], &bvh.ftbs[0], mesh, args.framebuffer_width, args.framebuffer_height, args.camera, pregen_bounce, rays); - #endif - args.rays = write_vector(main_memory, 256, rays, heap_address); - } - - args.nodes = write_vector(main_memory, 256, bvh.nodes, heap_address); - -#if USE_HECWBVH - args.ftbs = (rtm::FTB*)args.nodes; -#else - args.ftbs = write_vector(main_memory, 256, bvh.ftbs, heap_address); -#endif - - std::vector tris; - mesh.get_triangles(tris); - args.tris = write_vector(main_memory, 256, tris, heap_address); - - std::memcpy(main_memory + TRAX_KERNEL_ARGS_ADDRESS, &args, sizeof(TRaXKernelArgs)); - return args; -} - -static void run_sim_trax(SimulationConfig& sim_config) -{ - std::string project_folder_path = get_project_folder_path(); - -#if 0 //RTX 4090 ish - //Compute - double core_clock = 2235.0e6; - double dram_clock = 5250.0e6; - uint64_t stack_size = 512; - uint num_tms = 128; - uint num_tps = 128; - uint num_threads = 12; - - //Memory - uint64_t block_size = CACHE_BLOCK_SIZE; - uint num_partitions = 12; - uint partition_stride = 1 << 12; - - //DRAM - UnitDRAM::Configuration dram_config; - dram_config.config_path = project_folder_path + "build\\src\\arches-v2\\config-files\\gddr6x_21000_config.yaml"; - dram_config.size = 1ull << 30; //1GB per partition - dram_config.clock_ratio = dram_clock / core_clock; - dram_config.latency = 254; - - //L2$ - UnitL2Cache::Configuration l2_config; - l2_config.level = 2; - //l2_config.block_prefetch = true; - l2_config.miss_alloc = true; - l2_config.size = 6 << 20; - l2_config.associativity = 16; - l2_config.policy = Units::UnitCacheBase::Policy::LRU; - l2_config.num_slices = 6; - l2_config.crossbar_width = l2_config.num_slices; - l2_config.num_mshr = 256; - l2_config.num_subentries = 4; - l2_config.latency = 187; - - UnitL2Cache::PowerConfig l2_power_config; - - Units::UnitCrossbar::Configuration xbar_config; - xbar_config.num_slices = l2_config.num_slices; - xbar_config.slice_stride = l2_config.block_size; - xbar_config.num_partitions = num_partitions; - xbar_config.partition_stride = partition_stride; - - //L1d$ - UnitL1Cache::Configuration l1d_config; - l1d_config.level = 1; - l1d_config.miss_alloc = true; - l1d_config.size = 128 << 10; - l1d_config.associativity = 32; - l1d_config.policy = Units::UnitCacheBase::Policy::LRU; - l1d_config.num_banks = 4; - l1d_config.crossbar_width = l1d_config.num_banks; - l1d_config.num_mshr = 512; - l1d_config.num_subentries = 16; - l1d_config.latency = 39; - - UnitL1Cache::PowerConfig l1d_power_config; - - UnitRTCore::Configuration rtc_config; - rtc_config.max_rays = 32; - rtc_config.num_cache_ports = 4; - -#elif 0 //RTX 3070 ish - //Compute - double core_clock = 1500.0e6; - uint64_t stack_size = 512; - uint num_tms = 46; - uint num_tps = 128; - uint num_threads = 12; - - //Memory - double dram_clock = 3500.0e6; - uint num_partitions = 8; - uint partition_stride = 1 << 12; - - //DRAM - UnitDRAM::Configuration dram_config; - dram_config.config_path = project_folder_path + "build\\src\\arches-v2\\config-files\\gddr6_14000_config.yaml"; - dram_config.size = 1ull << 30; //1GB per partition - dram_config.clock_ratio = dram_clock / core_clock; - dram_config.latency = 254; - - //L2$ - UnitL2Cache::Configuration l2_config; - l2_config.level = 2; - l2_config.miss_alloc = true; - l2_config.size = 512 << 10; - l2_config.associativity = 16; - l2_config.policy = Units::UnitCacheBase::Policy::LRU; - l2_config.num_slices = 4; - l2_config.crossbar_width = l2_config.num_slices; - l2_config.num_mshr = 192; - l2_config.num_subentries = 4; - l2_config.latency = 187; - - UnitL2Cache::PowerConfig l2_power_config; - - Units::UnitCrossbar::Configuration xbar_config; - xbar_config.num_slices = l2_config.num_slices; - xbar_config.slice_stride = l2_config.block_size; - xbar_config.num_partitions = num_partitions; - xbar_config.partition_stride = partition_stride; - - //L1d$ - UnitL1Cache::Configuration l1d_config; - l1d_config.level = 1; - l1d_config.miss_alloc = true; - l1d_config.size = 128 << 10; - l1d_config.associativity = 32; - l1d_config.policy = Units::UnitCacheBase::Policy::FIFO; - l1d_config.num_banks = 4; - l1d_config.crossbar_width = l1d_config.num_banks; - l1d_config.num_mshr = 384; - l1d_config.num_subentries = 48; - l1d_config.latency = 39; - - UnitL1Cache::PowerConfig l1d_power_config; - - UnitRTCore::Configuration rtc_config; - rtc_config.max_rays = 128; - rtc_config.num_cache_ports = 4; - -#elif 1 //Turing spec -#if 1 //RTX 2080 - double core_clock = 1515.0e6; - uint num_threads = 8; - uint num_tps = 64; - uint num_tms = 46; - uint64_t stack_size = 1024; - - double dram_clock = 3500.0e6; - uint num_partitions = 8; - uint partition_stride = 1 << 12; -#else //RTX 2060 - double core_clock = 1365.0e6; - uint num_threads = 1; - uint num_tps = 64; - uint num_tms = 30; - uint64_t stack_size = 512; - - double dram_clock = 3500.0e6; - uint num_partitions = 6; - uint partition_stride = 1 << 12; -#endif - - //DRAM - UnitDRAM::Configuration dram_config; - dram_config.config_path = project_folder_path + "build\\src\\arches-v2\\config-files\\gddr6_14000_config.yaml"; - dram_config.size = 1ull << 30; //1GB - dram_config.clock_ratio = dram_clock / core_clock; - dram_config.latency = 92; - - //L2$ - UnitL2Cache::Configuration l2_config; - l2_config.level = 2; - l2_config.miss_alloc = true; - l2_config.size = 512 << 10; - l2_config.associativity = 16; - l2_config.num_slices = 4; - l2_config.crossbar_width = l2_config.num_slices; - l2_config.num_mshr = 192; - l2_config.num_subentries = 4; - l2_config.latency = 160; - - UnitL2Cache::PowerConfig l2_power_config; - - Units::UnitCrossbar::Configuration xbar_config; - xbar_config.num_slices = l2_config.num_slices; - xbar_config.slice_stride = l2_config.block_size; - xbar_config.num_partitions = num_partitions; - xbar_config.partition_stride = partition_stride; - - //L1d$ - UnitL1Cache::Configuration l1d_config; - l1d_config.level = 1; - l1d_config.miss_alloc = true; - l1d_config.size = 64 << 10; - l1d_config.associativity = 32; - l1d_config.num_banks = 16; - l1d_config.crossbar_width = l1d_config.num_banks; - l1d_config.num_mshr = 256; - l1d_config.num_subentries = 16; - l1d_config.latency = 20; - - UnitL1Cache::PowerConfig l1d_power_config; - - UnitRTCore::Configuration rtc_config; - rtc_config.max_rays = 64; - rtc_config.num_cache_ports = 2; -#else //TRaX 1.0 - double core_clock = 1000.0e6; - uint num_threads = 1; - uint num_tps = 32; - uint num_tms = 32; - uint num_rays = 32; - uint64_t stack_size = 4096; - - double dram_clock = 2000.0e6; - uint num_partitions = 4; - uint partition_stride = 1 << 12; - - //DRAM - UnitDRAM::Configuration dram_config; - dram_config.config_path = project_folder_path + "build\\src\\arches-v2\\config-files\\gddr6_pch_config.yaml"; - dram_config.size = 1ull << 30; //1GB - dram_config.clock_ratio = dram_clock / core_clock; - dram_config.latency = 1; - //dram_config.latency = 56; - - //L2$ - UnitL2Cache::Configuration l2_config; - l2_config.level = 2; - l2_config.miss_alloc = false; - l2_config.size = 256 << 10; - l2_config.associativity = 16; - l2_config.num_slices = 1; - l2_config.crossbar_width = l2_config.num_slices; - l2_config.num_mshr = 1; - l2_config.num_subentries = 1; - l2_config.latency = 10; - - UnitL2Cache::PowerConfig l2_power_config; - - Units::UnitCrossbar::Configuration xbar_config; - xbar_config.num_slices = l2_config.num_slices; - xbar_config.slice_stride = l2_config.block_size; - xbar_config.num_partitions = num_partitions; - xbar_config.partition_stride = partition_stride; - - //L1d$ - UnitL1Cache::Configuration l1d_config; - l1d_config.level = 1; - l1d_config.miss_alloc = false; - l1d_config.size = 32 << 10; - l1d_config.associativity = 4; - l1d_config.num_banks = 8; - l1d_config.crossbar_width = l1d_config.num_banks; - l1d_config.num_mshr = 256; - l1d_config.num_subentries = 16; - l1d_config.latency = 1; - - UnitL1Cache::PowerConfig l1d_power_config; -#endif - - ELF elf(project_folder_path + "src\\trax-kernel\\riscv\\kernel"); - - ISA::RISCV::InstructionTypeNameDatabase::get_instance()[ISA::RISCV::InstrType::CUSTOM0] = "FCHTHRD"; - ISA::RISCV::InstructionTypeNameDatabase::get_instance()[ISA::RISCV::InstrType::CUSTOM1] = "BOXISECT"; - ISA::RISCV::InstructionTypeNameDatabase::get_instance()[ISA::RISCV::InstrType::CUSTOM2] = "TRIISECT"; - ISA::RISCV::InstructionTypeNameDatabase::get_instance()[ISA::RISCV::InstrType::CUSTOM7] = "TRACERAY"; - ISA::RISCV::isa[ISA::RISCV::CUSTOM_OPCODE0] = ISA::RISCV::TRaX::custom0; - - uint num_sfus = static_cast(ISA::RISCV::InstrType::NUM_TYPES) * num_tms; - - Simulator simulator; - std::vector tps; - std::vector sfus; - std::vector thread_schedulers; - std::vector rtcs; - std::vector l1ds; - std::vector> unit_tables; unit_tables.reserve(num_tms); - std::vector> sfu_lists; sfu_lists.reserve(num_tms); - std::vector> mem_lists; mem_lists.reserve(num_tms); - - //construct memory partitions - std::vector drams; - std::vector l2s; - dram_config.num_ports = l2_config.num_slices; - l2_config.num_ports = l2_config.num_slices; - for(uint i = 0; i < num_partitions; ++i) - { - drams.push_back(_new UnitDRAM(dram_config)); - simulator.register_unit(drams.back()); - - l2_config.mem_higher_port = 0; - l2_config.mem_highers = {drams.back()}; - l2s.push_back(_new UnitL2Cache(l2_config)); - simulator.register_unit(l2s.back()); - simulator.new_unit_group(); - - xbar_config.mem_highers.push_back(l2s.back()); - } - - xbar_config.num_clients = num_tms; - Units::UnitCrossbar xbar(xbar_config); - simulator.register_unit(&xbar); - simulator.new_unit_group(); - - uint8_t* device_mem = (uint8_t*)malloc(3 << 29); - paddr_t heap_address = elf.load(device_mem); - TRaXKernelArgs kernel_args = initilize_buffers(device_mem, heap_address, sim_config, partition_stride); - heap_address = align_to(partition_stride, heap_address); - - for(uint addr = 0; addr < heap_address; addr += partition_stride) - drams[xbar.get_partition(addr)]->direct_write(device_mem + addr, partition_stride, xbar.strip_partition_bits(addr)); - - bool warm_l2 = false; - if(warm_l2) - { - paddr_t start = (paddr_t)kernel_args.nodes & ~(1 - partition_stride); - paddr_t end = start + l2_config.size * num_partitions; - for(paddr_t block_addr = end - l2_config.block_size; block_addr >= start; block_addr -= l2_config.block_size) - l2s[xbar.get_partition(block_addr)]->direct_write(xbar.strip_partition_bits(block_addr), device_mem + block_addr); - } - - bool deserialize_l2 = false, serialize_l2 = !deserialize_l2; - if(deserialize_l2) - for(uint i = 0; i < num_partitions; ++i) - serialize_l2 = !l2s[i]->deserialize("l2-p" + std::to_string(i) + ".bin", *drams[i]); - - Units::UnitAtomicRegfile atomic_regs(num_tms); - simulator.register_unit(&atomic_regs); - simulator.new_unit_group(); - - l1d_config.num_ports = num_tps; - l1d_config.mem_highers = {&xbar}; -#if TRAX_USE_RT_CORE - l1d_config.num_ports += num_tps; - l1d_config.crossbar_width *= 2; -#endif - - for(uint tm_index = 0; tm_index < num_tms; ++tm_index) - { - std::vector unit_table((uint)ISA::RISCV::InstrType::NUM_TYPES, nullptr); - std::vector mem_list; - std::vector sfu_list; - - l1d_config.mem_higher_port = tm_index; - l1ds.push_back(new UnitL1Cache(l1d_config)); - simulator.register_unit(l1ds.back()); - mem_list.push_back(l1ds.back()); - unit_table[(uint)ISA::RISCV::InstrType::LOAD] = l1ds.back(); - unit_table[(uint)ISA::RISCV::InstrType::STORE] = l1ds.back(); - - thread_schedulers.push_back(_new Units::UnitThreadScheduler(num_tps, tm_index, &atomic_regs, 32)); - simulator.register_unit(thread_schedulers.back()); - mem_list.push_back(thread_schedulers.back()); - unit_table[(uint)ISA::RISCV::InstrType::CUSTOM0] = thread_schedulers.back(); - - //sfu_list.push_back(_new Units::UnitSFU(num_tps_per_tm, 2, 1, num_tps_per_tm)); - //simulator.register_unit(sfu_list.back()); - //unit_table[(uint)ISA::RISCV::InstrType::FADD] = sfu_list.back(); - //unit_table[(uint)ISA::RISCV::InstrType::FMUL] = sfu_list.back(); - //unit_table[(uint)ISA::RISCV::InstrType::FFMAD] = sfu_list.back(); - - sfu_list.push_back(_new Units::UnitSFU(num_tps / 8, 1, 1, num_tps)); - simulator.register_unit(sfu_list.back()); - unit_table[(uint)ISA::RISCV::InstrType::IMUL] = sfu_list.back(); - unit_table[(uint)ISA::RISCV::InstrType::IDIV] = sfu_list.back(); - - sfu_list.push_back(_new Units::UnitSFU(num_tps / 16, 6, 1, num_tps)); - simulator.register_unit(sfu_list.back()); - unit_table[(uint)ISA::RISCV::InstrType::FDIV] = sfu_list.back(); - unit_table[(uint)ISA::RISCV::InstrType::FSQRT] = sfu_list.back(); - - #if TRAX_USE_HARDWARE_INTERSECTORS - sfu_list.push_back(_new Units::UnitSFU(2, 3, 1, num_tps_per_tm)); - simulator.register_unit(sfu_list.back()); - unit_table[(uint)ISA::RISCV::InstrType::CUSTOM1] = sfu_list.back(); - - sfu_list.push_back(_new Units::UnitSFU(1, 22, 8, num_tps_per_tm)); - simulator.register_unit(sfu_list.back()); - unit_table[(uint)ISA::RISCV::InstrType::CUSTOM2] = sfu_list.back(); - #endif - - for(auto& sfu : sfu_list) - sfus.push_back(sfu); - - //l1s_config.mem_higher_port = tm_index * 2 + 1; - //l1ss.push_back(new Units::UnitStreamCache(l1s_config)); - //simulator.register_unit(l1ss.back()); - - #if TRAX_USE_RT_CORE - rtc_config.num_clients = num_tps; - rtc_config.node_base_addr = (paddr_t)kernel_args.nodes; - rtc_config.tri_base_addr = (paddr_t)kernel_args.ftbs; - rtc_config.cache = l1ds.back(); - rtc_config.cache_port = num_tps; - rtc_config.cache_port_stride = num_tps / l1d_config.num_banks; - - rtcs.push_back(_new UnitRTCore(rtc_config)); - simulator.register_unit(rtcs.back()); - mem_list.push_back(rtcs.back()); - unit_table[(uint)ISA::RISCV::InstrType::CUSTOM7] = rtcs.back(); - #endif - - unit_tables.emplace_back(unit_table); - sfu_lists.emplace_back(sfu_list); - mem_lists.emplace_back(mem_list); - - Units::UnitTP::Configuration tp_config; - tp_config.tm_index = tm_index; - tp_config.stack_size = stack_size; - tp_config.cheat_memory = device_mem; - tp_config.unique_mems = &mem_lists.back(); - tp_config.unique_sfus = &sfu_lists.back(); - tp_config.num_threads = num_threads; - for(uint tp_index = 0; tp_index < num_tps; ++tp_index) - { - tp_config.tp_index = tp_index; - tp_config.unit_table = &unit_tables.back(); - tps.push_back(new Units::TRaX::UnitTP(tp_config)); - simulator.register_unit(tps.back()); - } - - simulator.new_unit_group(); - } - - printf("Starting TRaX\n"); - for(auto& tp : tps) - tp->set_entry_point(elf.elf_header->e_entry.u64); - - //master logs - UnitDRAM::Log dram_log; - UnitL2Cache::Log l2_log; - UnitL1Cache::Log l1d_log; - Units::UnitTP::Log tp_log; - - UnitRTCore::Log rtc_log; - - uint delta = sim_config.get_int("logging_interval"); - float delta_s = delta / core_clock; - float delta_ns = delta_s * 1e9; - float delta_dram_cycles = delta_s * dram_clock; - float peak_dram_bandwidth = dram_clock / core_clock * num_partitions * 4 * 32 / 8; - float peak_l2_bandwidth = num_partitions * l2_config.num_slices * MemoryRequest::MAX_SIZE; - float peak_l1d_bandwidth = num_tms * l1d_config.num_banks * MemoryRequest::MAX_SIZE; - - //peak_dram_bandwidth = 32 * core_clock * num_partitions; - //peak_l2_bandwidth = 32 * num_tms / 2; - - auto start = std::chrono::high_resolution_clock::now(); - simulator.execute(delta, [&]() -> void - { - UnitDRAM::Log dram_delta_log = delta_log(dram_log, drams); - UnitL2Cache::Log l2_delta_log = delta_log(l2_log, l2s); - UnitL1Cache::Log l1d_delta_log = delta_log(l1d_log, l1ds); - UnitRTCore::Log rtc_delta_log = delta_log(rtc_log, rtcs); - - double simulation_time = std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - start).count() / 1000.0; - - printf(" \n"); - printf("Cycle: %lld \n", simulator.current_cycle); - printf("Threads Launched: %d \n", atomic_regs.iregs[0]); - printf("Simulation rate: %.2f KHz\n", simulator.current_cycle / simulation_time / 1000.0); - printf(" \n"); - printf("DRAM Read: %8.1f GB/s (%.2f%%)\n", (float)dram_delta_log.bytes_read / delta_ns, 100.0f * dram_delta_log.bytes_read / delta / peak_dram_bandwidth); - printf(" L2$ Read: %8.1f B/clk (%.2f%%)\n", (float)l2_delta_log.bytes_read / delta, 100.0f * l2_delta_log.bytes_read / delta / peak_l2_bandwidth); - printf("L1d$ Read: %8.1f B/clk (%.2f%%)\n", (float)l1d_delta_log.bytes_read / delta, 100.0 * l1d_delta_log.bytes_read / delta / peak_l1d_bandwidth); - printf(" \n"); - printf(" L2$ Hit/Half/Miss: %3.1f%%/%3.1f%%/%3.1f%%\n", 100.0 * l2_delta_log.hits / l2_delta_log.get_total(), 100.0 * l2_delta_log.half_misses / l2_delta_log.get_total(), 100.0 * l2_delta_log.misses / l2_delta_log.get_total()); - printf("L1d$ Hit/Half/Miss: %3.1f%%/%3.1f%%/%3.1f%%\n", 100.0 * l1d_delta_log.hits / l1d_delta_log.get_total(), 100.0 * l1d_delta_log.half_misses / l1d_delta_log.get_total(), 100.0 * l1d_delta_log.misses / l1d_delta_log.get_total()); - printf(" \n"); - printf(" L2$ Stalls: %0.2f%%\n", 100.0 * l2_delta_log.mshr_stalls / num_partitions / l2_config.num_slices / l2_config.num_banks / delta); - printf("L1d$ Stalls: %0.2f%%\n", 100.0 * l1d_delta_log.mshr_stalls / num_tms / l1d_config.num_banks / delta); - printf(" \n"); - printf("L2$ Occ: %0.2f%%\n", 100.0 * l2_delta_log.get_total() / num_partitions / l2_config.num_slices / l2_config.num_banks / delta); - printf("L1d$ Occ: %0.2f%%\n", 100.0 * l1d_delta_log.get_total() / num_tms / l1d_config.num_banks / delta); - printf(" \n"); - if(!rtcs.empty()) - { - printf("MRays/s: %.0f\n\n", rtc_delta_log.rays / delta_ns * 1000.0); - rtc_delta_log.print(rtcs.size()); - } - }); - - auto stop = std::chrono::high_resolution_clock::now(); - - for(uint addr = 0; addr < heap_address; addr += partition_stride) - drams[xbar.get_partition(addr)]->direct_read(device_mem + addr, partition_stride, xbar.strip_partition_bits(addr)); - - if(serialize_l2) - for(uint i = 0; i < num_partitions; ++i) - l2s[i]->serialize("l2-p" + std::to_string(i) + ".bin"); - - cycles_t frame_cycles = simulator.current_cycle; - double frame_time = frame_cycles / core_clock; - double frame_time_ns = frame_time * 1e9; - double simulation_time = std::chrono::duration_cast(stop - start).count() / 1000.0; - - tp_log.print_profile(device_mem); - - float total_power = 0.0f; - for(auto& dram : drams) - { - dram->print_stats(4, frame_cycles); - total_power += dram->total_power(); - } - print_header("DRAM"); - delta_log(dram_log, drams); - printf("DRAM Read: %.1f GB/s (%.2f%%)\n", (float)dram_log.bytes_read / frame_time_ns, 100.0f * dram_log.bytes_read / frame_cycles / peak_dram_bandwidth); - dram_log.print(frame_cycles); - - print_header("L2$"); - delta_log(l2_log, l2s); - printf(" L2$ Read: %.1f B/clk (%.2f%%)\n", (float)l2_log.bytes_read / frame_cycles, 100.0f * l2_log.bytes_read / frame_cycles / peak_l2_bandwidth); - l2_log.print(frame_cycles); - total_power += l2_log.print_power(l2_power_config, frame_time); - - print_header("L1d$"); - delta_log(l1d_log, l1ds); - printf("L1d$ Read: %.1f B/clk (%.2f%%)\n", (float)l1d_log.bytes_read / frame_cycles, 100.0 * l1d_log.bytes_read / frame_cycles / peak_l1d_bandwidth); - l1d_log.print(frame_cycles); - total_power += l1d_log.print_power(l1d_power_config, frame_time); - - print_header("TP"); - delta_log(tp_log, tps); - tp_log.print(tps.size()); - - if(!rtcs.empty()) - { - print_header("RT Core"); - delta_log(rtc_log, rtcs); - rtc_log.print(rtcs.size()); - } - - float total_energy = total_power * frame_time; - - print_header("Performance Summary"); - printf("Cycles: %lld\n", simulator.current_cycle); - printf("Clock rate: %.0f MHz\n", core_clock / 1'000'000.0); - printf("Frame time: %.3g ms\n", frame_time * 1000.0); - if(!rtcs.empty()) printf("MRays/s: %.0f\n", rtc_log.rays / frame_time / 1'000'000.0); - else printf("MRays/s: %.0f\n", kernel_args.framebuffer_size / frame_time / 1'000'000.0); - - print_header("Power Summary"); - printf("Energy: %.2f mJ\n", total_power * frame_time * 1000.0); - printf("Power: %.2f W\n", total_power); - if(!rtcs.empty()) printf("MRays/J: %.2f\n", rtc_log.rays / total_energy / 1'000'000.0); - else printf("MRays/J: %.2f\n", kernel_args.framebuffer_size / total_energy / 1'000'000.0); - - print_header("Simulation Summary"); - printf("Simulation rate: %.2f KHz\n", simulator.current_cycle / simulation_time / 1000.0); - printf("Simulation time: %.0f s\n", simulation_time); - printf("MSIPS: %.2f\n", simulator.current_cycle * tps.size() / simulation_time / 1'000'000.0); - - stbi_flip_vertically_on_write(true); - stbi_write_png("out.png", (int)kernel_args.framebuffer_width, (int)kernel_args.framebuffer_height, 4, device_mem + (size_t)kernel_args.framebuffer, 0); - free(device_mem); - - for(auto& tp : tps) delete tp; - for(auto& sfu : sfus) delete sfu; - for(auto& l1d : l1ds) delete l1d; - for(auto& thread_scheduler : thread_schedulers) delete thread_scheduler; - for(auto& rtc : rtcs) delete rtc; - for(auto& l2 : l2s) delete l2; - for(auto& dram : drams) delete dram; -} -} -} \ No newline at end of file diff --git a/src/arches-v2/units/trax/unit-tp.hpp b/src/arches-v2/units/trax/unit-tp.hpp index e2c9ffc..851bc80 100644 --- a/src/arches-v2/units/trax/unit-tp.hpp +++ b/src/arches-v2/units/trax/unit-tp.hpp @@ -31,6 +31,16 @@ class UnitTP : public Arches::Units::UnitTP if(float_regs_pending[i]) return float_regs_pending[i]; } + else if(instr_info.instr_type == ISA::RISCV::InstrType::CUSTOM3) //SAMPLE2D + { + for(uint i = 0; i < 3; ++i) + if(float_regs_pending[instr.rs1 + i]) + return float_regs_pending[instr.rs1 + i]; + + for(uint i = 0; i < 4; ++i) + if(float_regs_pending[instr.rd + i]) + return float_regs_pending[instr.rd + i]; + } else if(instr_info.instr_type == ISA::RISCV::InstrType::CUSTOM7) //TRACE RAY { for(uint i = 0; i < sizeof(rtm::Ray) / sizeof(float); ++i) diff --git a/src/arches-v2/units/unit-sfu.hpp b/src/arches-v2/units/unit-sfu.hpp index d6fb97d..04c0f7d 100644 --- a/src/arches-v2/units/unit-sfu.hpp +++ b/src/arches-v2/units/unit-sfu.hpp @@ -54,7 +54,7 @@ class UnitSFU : public UnitBase return return_crossbar.peek(port_index); } - virtual const SFURequest& read_return(uint port_index) + virtual const SFURequest read_return(uint port_index) { return return_crossbar.read(port_index); } diff --git a/src/arches-v2/units/unit-stream-cache.hpp b/src/arches-v2/units/unit-stream-cache.hpp index b481746..f4ef09d 100644 --- a/src/arches-v2/units/unit-stream-cache.hpp +++ b/src/arches-v2/units/unit-stream-cache.hpp @@ -35,10 +35,13 @@ class UnitStreamCache : public UnitCacheBase struct PowerConfig { //Energy is joules, power in watts - float tag_energy{0.0f}; - float read_energy{0.0f}; - float write_energy{0.0f}; - float leakage_power{0.0f}; + float tag_energy; + float read_energy; + float write_energy; + float leakage_power; + + PowerConfig() : + tag_energy(0.0f), read_energy(0.0f), write_energy(0.0f), leakage_power(0.0f) {} }; UnitStreamCache(Configuration config); diff --git a/src/arches-v2/units/unit-texture.cpp b/src/arches-v2/units/unit-texture.cpp new file mode 100644 index 0000000..b79a188 --- /dev/null +++ b/src/arches-v2/units/unit-texture.cpp @@ -0,0 +1,79 @@ +#include "unit-texture.hpp" + +namespace Arches { namespace Units { + +void UnitTexture::clock_rise() +{ + _request_network.clock(); + + if(_filter_pipline.is_write_valid() && cache->return_port_read_valid(cache_port)) + { + MemoryReturn ret = cache->read_return(cache_port); + Sample& sample = _pending_samples[ret.dst.raw]; + _assert(sample.texel_addrs[0] == ret.paddr); + std::memcpy(&sample.texels[0], ret.data, sizeof(Texture2D::Texel)); + sample.pending_texels--; + + if(sample.pending_texels == 0) + _filter_pipline.write(sample); + } + + if(_request_network.is_read_valid(0)) + { + MemoryRequest req = _request_network.read(0); + req.dst.push(req.port, 8); + + Sample& sample = _pending_samples[req.dst.raw]; + sample.req = req; + + //nearest + sample.pending_texels = 1; + + rtm::vec2 uv; + std::memcpy(&uv, req.data, sizeof(rtm::vec2)); + Texture2D& texture = *(Texture2D*)&_cheat_mem[req.vaddr]; + + rtm::uvec2 iuv = texture.get_iuv(uv); + Texture2D::Texel* texel = texture.get_texel_addr(iuv); + sample.texel_addrs[0] = (paddr_t)texel; + + MemoryRequest texel_req; + texel_req.type = MemoryRequest::Type::LOAD; + texel_req.paddr = sample.texel_addrs[0]; + texel_req.size = sizeof(Texture2D::Texel); + texel_req.port = cache_port; + texel_req.dst = req.dst; + _texel_fill_queue.push(texel_req); + } + +} + +void UnitTexture::clock_fall() +{ + _filter_pipline.clock(); + + if(_filter_pipline.is_read_valid() && _return_network.is_write_valid(0)) + { + Sample sample = _filter_pipline.read(); + + MemoryReturn ret2(sample.req); + ret2.size = sizeof(rtm::vec4); + ret2.dst.pop(8); + + rtm::vec4 color = Texture2D::decode_texel(sample.texels[0]); + std::memcpy(ret2.data, &color, sizeof(rtm::vec4)); + + _return_network.write(ret2, 0); + } + + if(_texel_fill_queue.size() && cache->request_port_write_valid(cache_port)) + { + cache->write_request(_texel_fill_queue.front()); + _texel_fill_queue.pop(); + } + + _return_network.clock(); +} + +} +} \ No newline at end of file diff --git a/src/arches-v2/units/unit-texture.hpp b/src/arches-v2/units/unit-texture.hpp new file mode 100644 index 0000000..fee46b7 --- /dev/null +++ b/src/arches-v2/units/unit-texture.hpp @@ -0,0 +1,79 @@ +#pragma once +#include "stdafx.hpp" + +#include "unit-memory-base.hpp" + +#include "util/arbitration.hpp" + +namespace Arches { +namespace Units { + +class UnitTexture : public UnitMemoryBase +{ +public: + struct Configuration + { + uint num_clients{1}; + uint8_t* cheat_mem{nullptr}; + UnitMemoryBase* cache{nullptr}; + uint cache_port{0}; + }; + +private: + UnitMemoryBase* cache; + uint cache_port; + + Cascade> _request_network; + ReturnCascade _return_network; + + uint8_t* _cheat_mem; + + struct Sample + { + paddr_t texel_addrs[8]; + Texture2D::Texel texels[8]; + uint32_t pending_texels; + MemoryRequest req; + }; + + std::unordered_map _pending_samples; + std::queue _texel_fill_queue; + LatencyFIFO _filter_pipline; + +public: + UnitTexture(const Configuration& config) : UnitMemoryBase(), cache(config.cache), cache_port(config.cache_port), _cheat_mem(config.cheat_mem), + _filter_pipline(10), _request_network(config.num_clients, 1), _return_network(1, config.num_clients) + { + } + + void clock_rise() override; + void clock_fall() override; + + bool request_port_write_valid(uint port_index) override + { + return _request_network.is_write_valid(port_index); + } + + void write_request(const MemoryRequest& request) override + { + _request_network.write(request, request.port); + } + + bool return_port_read_valid(uint port_index) override + { + return _return_network.is_read_valid(port_index); + } + + const MemoryReturn& peek_return(uint port_index) override + { + return _return_network.peek(port_index); + } + + const MemoryReturn read_return(uint port_index) override + { + return _return_network.read(port_index); + } +}; + +} +} \ No newline at end of file diff --git a/src/arches-v2/util/alignment-allocator.hpp b/src/arches-v2/util/alignment-allocator.hpp index 0a2a1ae..44bcb7f 100644 --- a/src/arches-v2/util/alignment-allocator.hpp +++ b/src/arches-v2/util/alignment-allocator.hpp @@ -35,12 +35,20 @@ class AlignmentAllocator inline pointer allocate(size_type n) { - return (pointer)_aligned_malloc(n * sizeof(value_type), N); + #if defined BUILD_PLATFORM_WINDOWS + return (pointer)_aligned_malloc(n * sizeof(value_type), N); + #elif defined BUILD_PLATFORM_LINUX + return (pointer)aligned_alloc(N, n * sizeof(value_type)); + #endif } inline void deallocate(pointer p, size_type) { - _aligned_free(p); + #if defined BUILD_PLATFORM_WINDOWS + _aligned_free(p); + #elif defined BUILD_PLATFORM_LINUX + free(p); + #endif } inline void construct(pointer p, const value_type& wert) diff --git a/src/arches-v2/util/bit-manipulation.hpp b/src/arches-v2/util/bit-manipulation.hpp index 9792f69..a5da0c5 100644 --- a/src/arches-v2/util/bit-manipulation.hpp +++ b/src/arches-v2/util/bit-manipulation.hpp @@ -1,6 +1,15 @@ #pragma once #include "stdafx.hpp" +//cross platform intrinsics +#if defined BUILD_PLATFORM_WINDOWS + #define popcnt64(mask) __popcnt64(mask) + #define rotr64(mask, n) _rotr64(mask, n) +#elif defined BUILD_PLATFORM_LINUX + #define popcnt64(mask) _popcnt64(mask) + #define rotr64(mask, n) _lrotr(mask, n) +#endif + inline uint log2i(uint64_t in) { uint i = 0; @@ -34,12 +43,12 @@ inline uint clz(uint64_t mask) inline uint popcnt(uint64_t mask) { - return __popcnt64(mask); + return popcnt64(mask); } inline uint64_t rotr(uint64_t mask, uint n) { - return _rotr64(mask, n); + return rotr64(mask, n); } inline uint64_t pdep(uint64_t data, uint64_t mask) diff --git a/src/arches-v2/util/file.cpp b/src/arches-v2/util/file.cpp index dc15a03..3bfa7d0 100644 --- a/src/arches-v2/util/file.cpp +++ b/src/arches-v2/util/file.cpp @@ -5,6 +5,7 @@ #include #include #else + #include #include #include #endif diff --git a/src/arches-v2/util/stbi.cpp b/src/arches-v2/util/stbi.cpp index 35594c4..7a0eb89 100644 --- a/src/arches-v2/util/stbi.cpp +++ b/src/arches-v2/util/stbi.cpp @@ -1,7 +1,10 @@ //stb image +#include "stdafx.hpp" #define STB_IMAGE_IMPLEMENTATION #define STB_IMAGE_WRITE_IMPLEMENTATION -#define STBI_MSC_SECURE_CRT +#ifdef BUILD_PLATFORM_WINDOWS + #define STBI_MSC_SECURE_CRT +#endif //stb image #include "stb_image.h" diff --git a/src/trax-kernel/CMakeLists.txt b/src/trax-kernel/CMakeLists.txt index 38da35b..04fdaf3 100644 --- a/src/trax-kernel/CMakeLists.txt +++ b/src/trax-kernel/CMakeLists.txt @@ -2,6 +2,11 @@ cmake_minimum_required(VERSION 3.14) set(PROJECT_NAME "trax-kernel") +if(LINUX) + # enable intrinsics like _lzcnt_u64 - find this out + add_compile_options(-march=native) +endif() + file(GLOB FILES *.hpp *.cpp) add_executable(${PROJECT_NAME} ${FILES}) target_include_directories(${PROJECT_NAME} PUBLIC ${PROJECT_SOURCE_DIR}/trax-kernel) diff --git a/src/trax-kernel/custom-instr.hpp b/src/trax-kernel/custom-instr.hpp index 455ad8e..159580f 100644 --- a/src/trax-kernel/custom-instr.hpp +++ b/src/trax-kernel/custom-instr.hpp @@ -5,11 +5,22 @@ static std::atomic_uint _next_thread; #endif +// uint32_t inline fchthrd() +// { +// #ifdef __riscv +// uint32_t value = 0; +// asm volatile("fchthrd %0\n\t" : "=r" (value)); +// return value; +// #else +// return _next_thread++; +// #endif +// } + uint32_t inline fchthrd() { #ifdef __riscv uint32_t value = 0; - asm volatile("fchthrd %0\n\t" : "=r" (value)); + asm volatile(".insn i 0x000b, 0, %0, x0, 0\n\t" : "=r" (value)); return value; #else return _next_thread++; @@ -31,4 +42,24 @@ inline void ebreak() "ebreak\n\t" ); #endif -} \ No newline at end of file +} + +rtm::vec4 inline sample2d(Texture2D* texture, rtm::vec2 uv) +{ +#ifdef __riscv + uint32_t addr = (uint64_t)texture; + register float src0 asm("f0") = *(float*)&addr; + register float src1 asm("f1") = uv.x; + register float src2 asm("f2") = uv.y; + + register float dst0 asm("f28"); + register float dst1 asm("f29"); + register float dst2 asm("f30"); + register float dst3 asm("f31"); + + asm volatile(".insn i 0xb, 0x6, %0, %4, 0\n\t" : "=f" (dst0), "=f" (dst1), "=f" (dst2), "=f" (dst3) : "f" (src0), "f" (src1), "f" (src2)); + return rtm::vec4(dst0, dst1, dst2, dst3); +#else + return texture->sample(uv); +#endif +} diff --git a/src/trax-kernel/intersect.hpp b/src/trax-kernel/intersect.hpp index a3044a2..0dc1968 100644 --- a/src/trax-kernel/intersect.hpp +++ b/src/trax-kernel/intersect.hpp @@ -22,7 +22,9 @@ inline void _traceray(uint id, const rtm::Ray& ray, rtm::Hit& hit) asm volatile ( - "traceray %0, %4, %12\t\n" + // "traceray %0, %4, %12\t\n" + // ".insn i 0x0b, 0, %0, %4, %12\t\n" + ".insn i 0x0b, 5, %0, %4, %12\t\n" : "=f" (dst0), "=f" (dst1), "=f" (dst2), "=f" (dst3) : "f" (src0), "f" (src1), "f" (src2), "f" (src3), "f" (src4), "f" (src5), "f" (src6), "f" (src7), "I" (FLAGS) ); @@ -58,7 +60,13 @@ inline float _intersect(const rtm::AABB& aabb, const rtm::Ray& ray, const rtm::v register float src13 asm("f13") = aabb.max.z; float t; - asm volatile ("boxisect %0" : "=f" (t) : "f" (src0), "f" (src1), "f" (src2), "f" (src3), "f" (src4), "f" (src5), "f" (src6), "f" (src7), "f" (src8), "f" (src9), "f" (src10), "f" (src11), "f" (src12), "f" (src13)); + asm volatile ( + /*"boxisect %0"*/ + // ".insn r 0x0b %0" + ".insn r 0x0b, 0, 0x04, %0, x0, x0" + : "=f" (t) + : "f" (src0), "f" (src1), "f" (src2), "f" (src3), "f" (src4), "f" (src5), "f" (src6), "f" (src7), "f" (src8), "f" (src9), "f" (src10), "f" (src11), "f" (src12), "f" (src13) + ); return t; #else @@ -93,7 +101,12 @@ inline bool _intersect(const rtm::Triangle& tri, const rtm::Ray& ray, rtm::Hit& register float dst2 asm("f19") = hit.bc.y; register float dst3 asm("f20") = *(float*)&hit.id; - asm volatile("triisect %0\n\t" : "+f" (dst0), "+f" (dst1), "+f" (dst2), "+f" (dst3) : "f" (src0), "f" (src1), "f" (src2), "f" (src3), "f" (src4), "f" (src5), "f" (src6), "f" (src7), "f" (src8), "f" (src9), "f" (src10), "f" (src11), "f" (src12), "f" (src13), "f" (src14), "f" (src15), "f" (src16)); + asm volatile( + /*"triisect %0\n\t"*/ + // ".insn r 0x0b %0\n\t" + ".insn r 0x0b, 0, 0x08, %0, x0, x0" + : "+f" (dst0), "+f" (dst1), "+f" (dst2), "+f" (dst3) : "f" (src0), "f" (src1), "f" (src2), "f" (src3), "f" (src4), "f" (src5), "f" (src6), "f" (src7), "f" (src8), "f" (src9), "f" (src10), "f" (src11), "f" (src12), "f" (src13), "f" (src14), "f" (src15), "f" (src16) + ); bool is_hit = dst0 < hit.t; float _dst3 = dst3; diff --git a/src/trax-kernel/main.cpp b/src/trax-kernel/main.cpp index 4b00f8b..15b723a 100644 --- a/src/trax-kernel/main.cpp +++ b/src/trax-kernel/main.cpp @@ -114,18 +114,18 @@ inline static void kernel(const TRaXKernelArgs& args) rtm::uvec3 tci = args.tex_coord_indices[hit.id]; rtm::vec2 tc = args.tex_coords[tci[0]] * hit.bc[0] + args.tex_coords[tci[1]] * hit.bc[1] + args.tex_coords[tci[2]] * (1.0f - hit.bc[0] - hit.bc[1]); - rtm::vec3 albedo; + rtm::vec4 albedo; if(mat.use_am) { - albedo = mat.albedo_texture.sample(tc); + albedo = sample2d(&mat.albedo_texture, tc); } else { - albedo = rtm::vec3(1.0f, 0.0f, 1.0f); + albedo = rtm::vec4(1.0f, 0.0f, 1.0f, 1.0f); } //args.framebuffer[fb_index] = rtm::RNG::hash(mat.use_am) | 0xff000000; - args.framebuffer[fb_index] = encode_pixel(albedo); + args.framebuffer[fb_index] = encode_pixel(rtm::vec3(albedo.x, albedo.y, albedo.z)); } else { @@ -182,7 +182,7 @@ int main() } #else -#include +// #include int main(int argc, char* argv[]) { uint pregen_bounce = 0; @@ -205,6 +205,8 @@ int main(int argc, char* argv[]) args.camera = rtm::Camera(args.framebuffer_width, args.framebuffer_height, 12.0f, rtm::vec3(-900.6f, 150.8f, 120.74f), rtm::vec3(79.7f, 14.0f, -17.4f)); if(scene_name.compare("intel-sponza") == 0) args.camera = rtm::Camera(args.framebuffer_width, args.framebuffer_height, 12.0f, rtm::vec3(-900.6f, 150.8f, 120.74f), rtm::vec3(79.7f, 14.0f, -17.4f)); + if(scene_name.compare("sponza") == 0) + args.camera = rtm::Camera(args.framebuffer_width, args.framebuffer_height, 12.0f, rtm::vec3(0.0f, 2.0f, 0.0f), rtm::vec3(90.0f, 0.0f, -1.0f)); if(scene_name.compare("san-miguel") == 0) args.camera = rtm::Camera(args.framebuffer_width, args.framebuffer_height, 12.0f, rtm::vec3(7.448, 1.014, 12.357), rtm::vec3(7.448 + 0.608, 1.014 + 0.026, 12.357 - 0.794)); if(scene_name.compare("bistro") == 0) @@ -212,8 +214,8 @@ int main(int argc, char* argv[]) if(scene_name.compare("hairball") == 0) args.camera = rtm::Camera(args.framebuffer_width, args.framebuffer_height, 24.0f, rtm::vec3(0.0, 0.0, 10.0), rtm::vec3(0.0f, 0.0f, 0.0f)); - std::string mesh_path = "../../../datasets/" + scene_name + ".obj"; - std::string bvh_cache_path = "../../../datasets/cache/" + scene_name + ".bvh"; + std::string mesh_path = "datasets/" + scene_name + ".obj"; + std::string bvh_cache_path = "datasets/cache/" + scene_name + ".bvh"; rtm::Mesh mesh(mesh_path); @@ -322,7 +324,7 @@ int main(int argc, char* argv[]) args.material_indices = mesh.material_indices.data(); args.materials = mesh.materials.data(); - printf("\nStarting Taveral\n"); + printf("\nStarting Traversal\n"); auto start = std::chrono::high_resolution_clock::now(); std::vector threads; @@ -354,7 +356,7 @@ int main(int argc, char* argv[]) #endif stbi_flip_vertically_on_write(true); - stbi_write_png("./out.png", args.framebuffer_width, args.framebuffer_height, 4, args.framebuffer, 0); + stbi_write_png("out.png", args.framebuffer_width, args.framebuffer_height, 4, args.framebuffer, 0); return 0; } diff --git a/src/trax-kernel/makefile b/src/trax-kernel/makefile index 32096f1..afde1aa 100644 --- a/src/trax-kernel/makefile +++ b/src/trax-kernel/makefile @@ -2,10 +2,12 @@ make: riscv riscv-dissasm #-mno-relax riscv: ./* - riscv64-unknown-elf-g++ -mno-relax -nostartfiles -emain -Wstack-usage=512 -mabi=lp64f -Ofast -I ../../include ./main.cpp -o ./riscv/kernel + riscv64-unknown-elf-g++ -mno-relax -nostartfiles -emain -Wstack-usage=512 -mabi=lp64f -Ofast -I ../../include ./main.cpp -o ./kernel-gcc + #clang++-20 --target=riscv64-unknown-elf --gcc-toolchain=/opt/riscv --sysroot=/opt/riscv/riscv64-unknown-elf -march=rv64imaf -mabi=lp64f -mno-relax -nostartfiles -Wl,-e,main -Wframe-larger-than=512 -O3 -I ../../include ./main.cpp -o ./kernel-llvm riscv-dissasm: riscv riscv64-unknown-elf-objdump -d -x ./riscv/kernel > "./riscv/kernel.dump" + #riscv64-unknown-elf-objdump -d -x ./kernel-llvm > "./kernel-llvm.dump" clean: rm -f riscv/* \ No newline at end of file diff --git a/src/trax-kernel/riscv/kernel b/src/trax-kernel/riscv/kernel old mode 100644 new mode 100755 index d84f51d..c1beeee Binary files a/src/trax-kernel/riscv/kernel and b/src/trax-kernel/riscv/kernel differ diff --git a/src/trax-kernel/riscv/kernel.dump b/src/trax-kernel/riscv/kernel.dump index 4a6d791..654d9a2 100644 --- a/src/trax-kernel/riscv/kernel.dump +++ b/src/trax-kernel/riscv/kernel.dump @@ -6,359 +6,205 @@ EXEC_P, HAS_SYMS, D_PAGED start address 0x00000000000100e8 Program Header: -0x70000003 off 0x0000000000001646 vaddr 0x0000000000000000 paddr 0x0000000000000000 align 2**0 - filesz 0x000000000000003d memsz 0x0000000000000000 flags r-- +0x70000003 off 0x00000000000013c9 vaddr 0x0000000000000000 paddr 0x0000000000000000 align 2**0 + filesz 0x0000000000000050 memsz 0x0000000000000000 flags r-- LOAD off 0x0000000000000000 vaddr 0x0000000000010000 paddr 0x0000000000010000 align 2**12 - filesz 0x0000000000001634 memsz 0x0000000000001634 flags r-x + filesz 0x00000000000013b0 memsz 0x00000000000013b0 flags r-x STACK off 0x0000000000000000 vaddr 0x0000000000000000 paddr 0x0000000000000000 align 2**4 filesz 0x0000000000000000 memsz 0x0000000000000000 flags rw- Sections: Idx Name Size VMA LMA File off Algn - 0 .text 000004f0 00000000000100e8 00000000000100e8 000000e8 2**2 + 0 .text 00000288 00000000000100e8 00000000000100e8 000000e8 2**2 CONTENTS, ALLOC, LOAD, READONLY, CODE - 1 .eh_frame 00000040 00000000000105d8 00000000000105d8 000005d8 2**3 + 1 .eh_frame 00000028 0000000000010370 0000000000010370 00000370 2**3 CONTENTS, ALLOC, LOAD, READONLY, DATA - 2 .sdata 0000001c 0000000000011618 0000000000011618 00001618 2**2 + 2 .sdata 00000018 0000000000011398 0000000000011398 00001398 2**2 CONTENTS, ALLOC, LOAD, READONLY, DATA - 3 .comment 00000012 0000000000000000 0000000000000000 00001634 2**0 + 3 .comment 00000019 0000000000000000 0000000000000000 000013b0 2**0 CONTENTS, READONLY - 4 .riscv.attributes 0000003d 0000000000000000 0000000000000000 00001646 2**0 + 4 .riscv.attributes 00000050 0000000000000000 0000000000000000 000013c9 2**0 CONTENTS, READONLY SYMBOL TABLE: 00000000000100e8 l d .text 0000000000000000 .text -00000000000105d8 l d .eh_frame 0000000000000000 .eh_frame -0000000000011618 l d .sdata 0000000000000000 .sdata +0000000000010370 l d .eh_frame 0000000000000000 .eh_frame +0000000000011398 l d .sdata 0000000000000000 .sdata 0000000000000000 l d .comment 0000000000000000 .comment 0000000000000000 l d .riscv.attributes 0000000000000000 .riscv.attributes 0000000000000000 l df *ABS* 0000000000000000 main.cpp -0000000000011e18 g *ABS* 0000000000000000 __global_pointer$ -0000000000011618 g .sdata 0000000000000000 __SDATA_BEGIN__ -0000000000011638 g .sdata 0000000000000000 __BSS_END__ -0000000000011634 g .sdata 0000000000000000 __bss_start -00000000000100e8 g F .text 00000000000004f0 main -0000000000011618 g .sdata 0000000000000000 __DATA_BEGIN__ -0000000000011634 g .sdata 0000000000000000 _edata -0000000000011638 g .sdata 0000000000000000 _end +0000000000011b98 g *ABS* 0000000000000000 __global_pointer$ +0000000000011398 g .sdata 0000000000000000 __SDATA_BEGIN__ +00000000000113b0 g .sdata 0000000000000000 __BSS_END__ +00000000000113b0 g .sdata 0000000000000000 __bss_start +00000000000100e8 g F .text 0000000000000288 main +0000000000011398 g .sdata 0000000000000000 __DATA_BEGIN__ +00000000000113b0 g .sdata 0000000000000000 _edata +00000000000113b0 g .sdata 0000000000000000 _end Disassembly of section .text: 00000000000100e8
: - 100e8: 0000070b fchthrd a4 + 100e8: 0000070b .insn 4, 0x070b 100ec: 10802783 lw a5,264(zero) # 108 100f0: 0007071b sext.w a4,a4 - 100f4: 4cf77e63 bgeu a4,a5,105d0 - 100f8: 00100793 li a5,1 - 100fc: 00011537 lui a0,0x11 - 10100: 000115b7 lui a1,0x11 - 10104: 00011637 lui a2,0x11 - 10108: 000116b7 lui a3,0x11 - 1010c: 02079893 sll a7,a5,0x20 - 10110: 62052687 flw fa3,1568(a0) # 11620 <__DATA_BEGIN__+0x8> - 10114: 6245a707 flw fa4,1572(a1) # 11624 <__DATA_BEGIN__+0xc> - 10118: 61862587 flw fa1,1560(a2) # 11618 <__DATA_BEGIN__> - 1011c: 61c6a607 flw fa2,1564(a3) # 1161c <__DATA_BEGIN__+0x4> - 10120: 11003803 ld a6,272(zero) # 110 - 10124: 11804503 lbu a0,280(zero) # 118 - 10128: fe010113 add sp,sp,-32 - 1012c: 00f888b3 add a7,a7,a5 - 10130: 2ec0006f j 1041c - 10134: 00c16603 lwu a2,12(sp) - 10138: 1b003703 ld a4,432(zero) # 1b0 - 1013c: 1b803303 ld t1,440(zero) # 1b8 - 10140: 00261693 sll a3,a2,0x2 - 10144: 00d70733 add a4,a4,a3 - 10148: 00076583 lwu a1,0(a4) - 1014c: ffff0737 lui a4,0xffff0 - 10150: 0ff70713 add a4,a4,255 # ffffffffffff00ff <__global_pointer$+0xfffffffffffde2e7> - 10154: 00359693 sll a3,a1,0x3 - 10158: 40b686b3 sub a3,a3,a1 - 1015c: 00369693 sll a3,a3,0x3 - 10160: 00d306b3 add a3,t1,a3 - 10164: 0016c583 lbu a1,1(a3) - 10168: 0025f593 and a1,a1,2 - 1016c: 28058e63 beqz a1,10408 - 10170: 0086b583 ld a1,8(a3) - 10174: ff000737 lui a4,0xff000 - 10178: 28058863 beqz a1,10408 - 1017c: 0186b303 ld t1,24(a3) - 10180: 3b158e63 beq a1,a7,1053c - 10184: 00161593 sll a1,a2,0x1 - 10188: 19003e83 ld t4,400(zero) # 190 - 1018c: 00c58633 add a2,a1,a2 - 10190: 00261613 sll a2,a2,0x2 - 10194: 00ce8eb3 add t4,t4,a2 - 10198: 004eee03 lwu t3,4(t4) - 1019c: 1a803603 ld a2,424(zero) # 1a8 - 101a0: 01eef553 fadd.s fa0,ft9,ft10 - 101a4: 003e1e13 sll t3,t3,0x3 - 101a8: 01c60e33 add t3,a2,t3 - 101ac: 00011f37 lui t5,0x11 - 101b0: 62cf2187 flw ft3,1580(t5) # 1162c <__DATA_BEGIN__+0x14> - 101b4: 004e2787 flw fa5,4(t3) - 101b8: 008ee583 lwu a1,8(t4) - 101bc: 08a1f553 fsub.s fa0,ft3,fa0 - 101c0: 11e7f7d3 fmul.s fa5,fa5,ft10 - 101c4: 00359593 sll a1,a1,0x3 - 101c8: 00b605b3 add a1,a2,a1 - 101cc: 000eee83 lwu t4,0(t4) - 101d0: 0045a007 flw ft0,4(a1) - 101d4: 000e2087 flw ft1,0(t3) - 101d8: 003e9e13 sll t3,t4,0x3 - 101dc: 78057043 fmadd.s ft0,fa0,ft0,fa5 - 101e0: 01c60633 add a2,a2,t3 - 101e4: 00462787 flw fa5,4(a2) - 101e8: 101f7f53 fmul.s ft10,ft10,ft1 - 101ec: 0005a087 flw ft1,0(a1) - 101f0: 01d7f7c3 fmadd.s fa5,fa5,ft9,ft0 - 101f4: 00062107 flw ft2,0(a2) - 101f8: 00c6af03 lw t5,12(a3) - 101fc: f0157543 fmadd.s fa0,fa0,ft1,ft10 - 10200: 000115b7 lui a1,0x11 - 10204: c0079653 fcvt.w.s a2,fa5,rtz - 10208: 6205a207 flw ft4,1568(a1) # 11620 <__DATA_BEGIN__+0x8> - 1020c: d00f7053 fcvt.s.w ft0,t5 - 10210: d00670d3 fcvt.s.w ft1,a2 - 10214: 51d17143 fmadd.s ft2,ft2,ft9,fa0 - 10218: 0086ae83 lw t4,8(a3) - 1021c: 0817f7d3 fsub.s fa5,fa5,ft1 - 10220: 00011637 lui a2,0x11 - 10224: 62462507 flw fa0,1572(a2) # 11624 <__DATA_BEGIN__+0xc> - 10228: c00116d3 fcvt.w.s a3,ft2,rtz - 1022c: d00ef2d3 fcvt.s.w ft5,t4 - 10230: 2007f0c3 fmadd.s ft1,fa5,ft0,ft4 - 10234: d006f353 fcvt.s.w ft6,a3 - 10238: 5007f7c3 fmadd.s fa5,fa5,ft0,fa0 - 1023c: 00011637 lui a2,0x11 - 10240: 08617153 fsub.s ft2,ft2,ft6 - 10244: c01096d3 fcvt.wu.s a3,ft1,rtz - 10248: 62862007 flw ft0,1576(a2) # 11628 <__DATA_BEGIN__+0x10> - 1024c: c0179e53 fcvt.wu.s t3,fa5,rtz - 10250: 01e686bb addw a3,a3,t5 - 10254: 03e6f6bb remuw a3,a3,t5 - 10258: 50517543 fmadd.s fa0,ft2,ft5,fa0 - 1025c: 01ee0e3b addw t3,t3,t5 - 10260: 20517143 fmadd.s ft2,ft2,ft5,ft4 - 10264: c00095d3 fcvt.w.s a1,ft1,rtz - 10268: 00812e27 fsw fs0,28(sp) - 1026c: c0151653 fcvt.wu.s a2,fa0,rtz - 10270: d005f7d3 fcvt.s.w fa5,a1 - 10274: c01115d3 fcvt.wu.s a1,ft2,rtz - 10278: 01d6063b addw a2,a2,t4 - 1027c: c0011fd3 fcvt.w.s t6,ft2,rtz - 10280: 01d585bb addw a1,a1,t4 - 10284: 08f0f0d3 fsub.s ft1,ft1,fa5 - 10288: d00ff7d3 fcvt.s.w fa5,t6 - 1028c: 00011fb7 lui t6,0x11 - 10290: 630fa307 flw ft6,1584(t6) # 11630 <__DATA_BEGIN__+0x18> - 10294: 08f17153 fsub.s ft2,ft2,fa5 - 10298: f00003d3 fmv.w.x ft7,zero - 1029c: 03ee7e3b remuw t3,t3,t5 - 102a0: 03d6763b remuw a2,a2,t4 - 102a4: 03d5f5bb remuw a1,a1,t4 - 102a8: 03d686bb mulw a3,a3,t4 - 102ac: 03de0e3b mulw t3,t3,t4 - 102b0: 00c68ebb addw t4,a3,a2 - 102b4: 020e9f13 sll t5,t4,0x20 - 102b8: 01ef5e93 srl t4,t5,0x1e - 102bc: 01d30eb3 add t4,t1,t4 - 102c0: 002ecf03 lbu t5,2(t4) - 102c4: 001ecf83 lbu t6,1(t4) - 102c8: 000ece83 lbu t4,0(t4) - 102cc: d01f7553 fcvt.s.wu fa0,t5 - 102d0: d01ff853 fcvt.s.wu fa6,t6 - 102d4: 01c6063b addw a2,a2,t3 - 102d8: d01ef2d3 fcvt.s.wu ft5,t4 - 102dc: 02061e93 sll t4,a2,0x20 - 102e0: 01eed613 srl a2,t4,0x1e - 102e4: 00c30633 add a2,t1,a2 - 102e8: 00264e83 lbu t4,2(a2) - 102ec: 00164f03 lbu t5,1(a2) - 102f0: 10087853 fmul.s fa6,fa6,ft0 - 102f4: 10057553 fmul.s fa0,fa0,ft0 - 102f8: 00064603 lbu a2,0(a2) - 102fc: d01effd3 fcvt.s.wu ft11,t4 - 10300: 1002f2d3 fmul.s ft5,ft5,ft0 - 10304: d01f7ed3 fcvt.s.wu ft9,t5 - 10308: d01678d3 fcvt.s.wu fa7,a2 - 1030c: 500ff54b fnmsub.s fa0,ft11,ft0,fa0 - 10310: 800ef84b fnmsub.s fa6,ft9,ft0,fa6 - 10314: 00b686bb addw a3,a3,a1 - 10318: 02069613 sll a2,a3,0x20 - 1031c: 2808f2cb fnmsub.s ft5,fa7,ft0,ft5 - 10320: 01e65693 srl a3,a2,0x1e - 10324: 00d306b3 add a3,t1,a3 - 10328: 0026c603 lbu a2,2(a3) - 1032c: 0016ce83 lbu t4,1(a3) - 10330: 1100f853 fmul.s fa6,ft1,fa6 - 10334: 10a0f553 fmul.s fa0,ft1,fa0 - 10338: 01c585bb addw a1,a1,t3 - 1033c: 0006ce03 lbu t3,0(a3) - 10340: 02059693 sll a3,a1,0x20 - 10344: 1050f2d3 fmul.s ft5,ft1,ft5 - 10348: d01677d3 fcvt.s.wu fa5,a2 - 1034c: 01e6d593 srl a1,a3,0x1e - 10350: d01eff53 fcvt.s.wu ft10,t4 - 10354: 00b306b3 add a3,t1,a1 - 10358: 0016c583 lbu a1,1(a3) - 1035c: 0026c603 lbu a2,2(a3) - 10360: d01e7e53 fcvt.s.wu ft8,t3 - 10364: 800efec3 fmadd.s ft9,ft9,ft0,fa6 - 10368: 500fffc3 fmadd.s ft11,ft11,ft0,fa0 - 1036c: 100f7f53 fmul.s ft10,ft10,ft0 - 10370: 1007f7d3 fmul.s fa5,fa5,ft0 - 10374: 0006c683 lbu a3,0(a3) - 10378: 100e7853 fmul.s fa6,ft8,ft0 - 1037c: 2808f2c3 fmadd.s ft5,fa7,ft0,ft5 - 10380: d015f553 fcvt.s.wu fa0,a1 - 10384: d0167e53 fcvt.s.wu ft8,a2 - 10388: d016f8d3 fcvt.s.wu fa7,a3 - 1038c: e8057447 fmsub.s fs0,fa0,ft0,ft9 - 10390: 780e77cb fnmsub.s fa5,ft8,ft0,fa5 - 10394: f005754b fnmsub.s fa0,fa0,ft0,ft10 - 10398: f80e7e47 fmsub.s ft8,ft8,ft0,ft11 - 1039c: 8008f84b fnmsub.s fa6,fa7,ft0,fa6 - 103a0: 2808f8c7 fmsub.s fa7,fa7,ft0,ft5 - 103a4: 40a0f543 fmadd.s fa0,ft1,fa0,fs0 - 103a8: e0f0f7c3 fmadd.s fa5,ft1,fa5,ft8 - 103ac: 01c12407 flw fs0,28(sp) - 103b0: 8900f0c3 fmadd.s ft1,ft1,fa6,fa7 - 103b4: e8a17543 fmadd.s fa0,ft2,fa0,ft9 - 103b8: f8f177c3 fmadd.s fa5,ft2,fa5,ft11 - 103bc: 28117043 fmadd.s ft0,ft2,ft1,ft5 - 103c0: 28751553 fmax.s fa0,fa0,ft7 - 103c4: 287797d3 fmax.s fa5,fa5,ft7 - 103c8: 28701053 fmax.s ft0,ft0,ft7 - 103cc: 28350553 fmin.s fa0,fa0,ft3 - 103d0: 283787d3 fmin.s fa5,fa5,ft3 - 103d4: 28300053 fmin.s ft0,ft0,ft3 - 103d8: 20657543 fmadd.s fa0,fa0,ft6,ft4 - 103dc: 2067f7c3 fmadd.s fa5,fa5,ft6,ft4 - 103e0: 20607043 fmadd.s ft0,ft0,ft6,ft4 - 103e4: c01516d3 fcvt.wu.s a3,fa0,rtz - 103e8: c0179653 fcvt.wu.s a2,fa5,rtz - 103ec: c01015d3 fcvt.wu.s a1,ft0,rtz - 103f0: 0086969b sllw a3,a3,0x8 - 103f4: 0106161b sllw a2,a2,0x10 - 103f8: 00c6e6b3 or a3,a3,a2 - 103fc: 00b6e6b3 or a3,a3,a1 - 10400: 00e6e733 or a4,a3,a4 - 10404: 0007071b sext.w a4,a4 - 10408: 00e7a023 sw a4,0(a5) - 1040c: 0000070b fchthrd a4 - 10410: 10802783 lw a5,264(zero) # 108 - 10414: 0007071b sext.w a4,a4 - 10418: 0af77263 bgeu a4,a5,104bc - 1041c: 10002783 lw a5,256(zero) # 100 - 10420: 0057569b srlw a3,a4,0x5 - 10424: 0027559b srlw a1,a4,0x2 - 10428: 0027d31b srlw t1,a5,0x2 - 1042c: 0266d63b divuw a2,a3,t1 - 10430: 0075f593 and a1,a1,7 - 10434: 00377713 and a4,a4,3 - 10438: 0266f6bb remuw a3,a3,t1 - 1043c: 0036161b sllw a2,a2,0x3 - 10440: 00b6063b addw a2,a2,a1 - 10444: 02c787bb mulw a5,a5,a2 - 10448: 0026969b sllw a3,a3,0x2 - 1044c: 00e686bb addw a3,a3,a4 - 10450: 00d787bb addw a5,a5,a3 - 10454: 02079793 sll a5,a5,0x20 - 10458: 0207d793 srl a5,a5,0x20 - 1045c: 06050663 beqz a0,104c8 - 10460: 16803703 ld a4,360(zero) # 168 - 10464: 00579693 sll a3,a5,0x5 - 10468: 00d70733 add a4,a4,a3 - 1046c: 00072007 flw ft0,0(a4) # ffffffffff000000 <__global_pointer$+0xfffffffffefee1e8> - 10470: 00472087 flw ft1,4(a4) - 10474: 00872107 flw ft2,8(a4) - 10478: 00c72187 flw ft3,12(a4) - 1047c: 01072207 flw ft4,16(a4) - 10480: 01472287 flw ft5,20(a4) - 10484: 01872307 flw ft6,24(a4) - 10488: 01c72387 flw ft7,28(a4) - 1048c: 00005e0b traceray ft8,ft0,0 - 10490: a1c38753 fle.s a4,ft7,ft8 - 10494: 00279793 sll a5,a5,0x2 - 10498: 01f12627 fsw ft11,12(sp) - 1049c: 00f807b3 add a5,a6,a5 - 104a0: c8070ae3 beqz a4,10134 - 104a4: ff000737 lui a4,0xff000 - 104a8: 00e7a023 sw a4,0(a5) - 104ac: 0000070b fchthrd a4 - 104b0: 10802783 lw a5,264(zero) # 108 - 104b4: 0007071b sext.w a4,a4 - 104b8: f6f762e3 bltu a4,a5,1041c - 104bc: 00000513 li a0,0 - 104c0: 02010113 add sp,sp,32 - 104c4: 00008067 ret - 104c8: d006f7d3 fcvt.s.w fa5,a3 - 104cc: 12002507 flw fa0,288(zero) # 120 - 104d0: d0067353 fcvt.s.w ft6,a2 - 104d4: 00d7f7d3 fadd.s fa5,fa5,fa3 - 104d8: 14c02287 flw ft5,332(zero) # 14c - 104dc: 00d37353 fadd.s ft6,ft6,fa3 - 104e0: 12402207 flw ft4,292(zero) # 124 - 104e4: 15002187 flw ft3,336(zero) # 150 - 104e8: 70a7f7c3 fmadd.s fa5,fa5,fa0,fa4 - 104ec: 13c02107 flw ft2,316(zero) # 13c - 104f0: 15402087 flw ft1,340(zero) # 154 - 104f4: 13402007 flw ft0,308(zero) # 134 - 104f8: 13802507 flw fa0,312(zero) # 138 - 104fc: 70437343 fmadd.s ft6,ft6,ft4,fa4 - 10500: 2807f047 fmsub.s ft0,fa5,ft0,ft5 - 10504: 18a7f547 fmsub.s fa0,fa5,fa0,ft3 - 10508: 0827f7c7 fmsub.s fa5,fa5,ft2,ft1 - 1050c: 14002207 flw ft4,320(zero) # 140 - 10510: 14802087 flw ft1,328(zero) # 148 - 10514: 14402287 flw ft5,324(zero) # 144 - 10518: 00437243 fmadd.s ft4,ft6,ft4,ft0 - 1051c: 13002107 flw ft2,304(zero) # 130 - 10520: 505372c3 fmadd.s ft5,ft6,ft5,fa0 - 10524: 12802007 flw ft0,296(zero) # 128 - 10528: 78137343 fmadd.s ft6,ft6,ft1,fa5 - 1052c: 20b583d3 fmv.s ft7,fa1 - 10530: 12c02087 flw ft1,300(zero) # 12c - 10534: 20c601d3 fmv.s ft3,fa2 - 10538: f55ff06f j 1048c - 1053c: 00234603 lbu a2,2(t1) - 10540: 00134e03 lbu t3,1(t1) - 10544: 00034683 lbu a3,0(t1) - 10548: 000115b7 lui a1,0x11 - 1054c: 6285a107 flw ft2,1576(a1) # 11628 <__DATA_BEGIN__+0x10> - 10550: d0167553 fcvt.s.wu fa0,a2 - 10554: d01e7053 fcvt.s.wu ft0,t3 - 10558: d016f7d3 fcvt.s.wu fa5,a3 - 1055c: 10257553 fmul.s fa0,fa0,ft2 - 10560: 10207053 fmul.s ft0,ft0,ft2 - 10564: f00000d3 fmv.w.x ft1,zero - 10568: 1027f7d3 fmul.s fa5,fa5,ft2 - 1056c: 000116b7 lui a3,0x11 - 10570: 62c6a187 flw ft3,1580(a3) # 1162c <__DATA_BEGIN__+0x14> - 10574: 28101053 fmax.s ft0,ft0,ft1 - 10578: 28151553 fmax.s fa0,fa0,ft1 - 1057c: 281797d3 fmax.s fa5,fa5,ft1 - 10580: 00011637 lui a2,0x11 - 10584: 000116b7 lui a3,0x11 - 10588: 63062107 flw ft2,1584(a2) # 11630 <__DATA_BEGIN__+0x18> - 1058c: 6206a087 flw ft1,1568(a3) # 11620 <__DATA_BEGIN__+0x8> - 10590: 28300053 fmin.s ft0,ft0,ft3 - 10594: 28350553 fmin.s fa0,fa0,ft3 - 10598: 283787d3 fmin.s fa5,fa5,ft3 - 1059c: 08207043 fmadd.s ft0,ft0,ft2,ft1 - 105a0: 08257543 fmadd.s fa0,fa0,ft2,ft1 - 105a4: 0827f7c3 fmadd.s fa5,fa5,ft2,ft1 - 105a8: c01016d3 fcvt.wu.s a3,ft0,rtz - 105ac: c0151653 fcvt.wu.s a2,fa0,rtz - 105b0: c01795d3 fcvt.wu.s a1,fa5,rtz - 105b4: 0086969b sllw a3,a3,0x8 - 105b8: 0106161b sllw a2,a2,0x10 - 105bc: 00c6e6b3 or a3,a3,a2 - 105c0: 00b6e6b3 or a3,a3,a1 - 105c4: 00e6e733 or a4,a3,a4 - 105c8: 0007071b sext.w a4,a4 - 105cc: e3dff06f j 10408 - 105d0: 00000513 li a0,0 - 105d4: 00008067 ret + 100f4: 20f77063 bgeu a4,a5,102f4 + 100f8: 000117b7 lui a5,0x11 + 100fc: 3a07a707 flw fa4,928(a5) # 113a0 <__DATA_BEGIN__+0x8> + 10100: 000117b7 lui a5,0x11 + 10104: 3a47a507 flw fa0,932(a5) # 113a4 <__DATA_BEGIN__+0xc> + 10108: 000117b7 lui a5,0x11 + 1010c: 3987a887 flw fa7,920(a5) # 11398 <__DATA_BEGIN__> + 10110: 000117b7 lui a5,0x11 + 10114: 39c7a807 flw fa6,924(a5) # 1139c <__DATA_BEGIN__+0x4> + 10118: 000117b7 lui a5,0x11 + 1011c: 3a87a687 flw fa3,936(a5) # 113a8 <__DATA_BEGIN__+0x10> + 10120: 000117b7 lui a5,0x11 + 10124: 3ac7a587 flw fa1,940(a5) # 113ac <__DATA_BEGIN__+0x14> + 10128: 11003803 ld a6,272(zero) # 110 + 1012c: 11804503 lbu a0,280(zero) # 118 + 10130: f0000653 fmv.w.x fa2,zero + 10134: ff0008b7 lui a7,0xff000 + 10138: 1200006f j 10258 + 1013c: 02071613 slli a2,a4,0x20 + 10140: 1b003703 ld a4,432(zero) # 1b0 + 10144: 02065613 srli a2,a2,0x20 + 10148: 00261693 slli a3,a2,0x2 + 1014c: 00d70733 add a4,a4,a3 + 10150: 00076303 lwu t1,0(a4) + 10154: 1b803583 ld a1,440(zero) # 1b8 + 10158: ffff0737 lui a4,0xffff0 + 1015c: 00331693 slli a3,t1,0x3 + 10160: 006686b3 add a3,a3,t1 + 10164: 00369693 slli a3,a3,0x3 + 10168: 00d586b3 add a3,a1,a3 + 1016c: 0016c583 lbu a1,1(a3) + 10170: 0ff70713 addi a4,a4,255 # ffffffffffff00ff <__global_pointer$+0xfffffffffffde567> + 10174: 0025f593 andi a1,a1,2 + 10178: 0c058463 beqz a1,10240 + 1017c: 19003303 ld t1,400(zero) # 190 + 10180: 00161713 slli a4,a2,0x1 + 10184: 00c70633 add a2,a4,a2 + 10188: 00261613 slli a2,a2,0x2 + 1018c: 00c30333 add t1,t1,a2 + 10190: 00436583 lwu a1,4(t1) + 10194: 1a803703 ld a4,424(zero) # 1a8 + 10198: 00036603 lwu a2,0(t1) + 1019c: 00359593 slli a1,a1,0x3 + 101a0: 00b705b3 add a1,a4,a1 + 101a4: 0005a087 flw ft1,0(a1) + 101a8: 0045a007 flw ft0,4(a1) + 101ac: 01eef7d3 fadd.s fa5,ft9,ft10 + 101b0: 00361613 slli a2,a2,0x3 + 101b4: 100f7053 fmul.s ft0,ft10,ft0 + 101b8: 101f7f53 fmul.s ft10,ft10,ft1 + 101bc: 00c70633 add a2,a4,a2 + 101c0: 00462107 flw ft2,4(a2) + 101c4: 00062087 flw ft1,0(a2) + 101c8: 00836583 lwu a1,8(t1) + 101cc: 01d17143 fmadd.s ft2,ft2,ft9,ft0 + 101d0: 08f6f7d3 fsub.s fa5,fa3,fa5 + 101d4: f1d0f0c3 fmadd.s ft1,ft1,ft9,ft10 + 101d8: 00359613 slli a2,a1,0x3 + 101dc: 00c70733 add a4,a4,a2 + 101e0: 00472207 flw ft4,4(a4) + 101e4: 00072187 flw ft3,0(a4) + 101e8: 0086869b addiw a3,a3,8 + 101ec: f0068053 fmv.w.x ft0,a3 + 101f0: 1047f143 fmadd.s ft2,fa5,ft4,ft2 + 101f4: 0837f0c3 fmadd.s ft1,fa5,ft3,ft1 + 101f8: 00006e0b .insn 4, 0x6e0b + 101fc: 28ce9ed3 fmax.s ft9,ft9,fa2 + 10200: 28cf1f53 fmax.s ft10,ft10,fa2 + 10204: 28ce1e53 fmax.s ft8,ft8,fa2 + 10208: 28de8ed3 fmin.s ft9,ft9,fa3 + 1020c: 28df0f53 fmin.s ft10,ft10,fa3 + 10210: 28de0e53 fmin.s ft8,ft8,fa3 + 10214: 70befec3 fmadd.s ft9,ft9,fa1,fa4 + 10218: 70bf7f43 fmadd.s ft10,ft10,fa1,fa4 + 1021c: 70be7e43 fmadd.s ft8,ft8,fa1,fa4 + 10220: c01e96d3 fcvt.wu.s a3,ft9,rtz + 10224: c01f1653 fcvt.wu.s a2,ft10,rtz + 10228: c01e1753 fcvt.wu.s a4,ft8,rtz + 1022c: 0086969b slliw a3,a3,0x8 + 10230: 0106161b slliw a2,a2,0x10 + 10234: 00c6e6b3 or a3,a3,a2 + 10238: 00d76733 or a4,a4,a3 + 1023c: 01176733 or a4,a4,a7 + 10240: 00f807b3 add a5,a6,a5 + 10244: 00e7a023 sw a4,0(a5) + 10248: 0000070b .insn 4, 0x070b + 1024c: 10802783 lw a5,264(zero) # 108 + 10250: 0007071b sext.w a4,a4 + 10254: 0af77063 bgeu a4,a5,102f4 + 10258: 10002783 lw a5,256(zero) # 100 + 1025c: 0057569b srliw a3,a4,0x5 + 10260: 03b71593 slli a1,a4,0x3b + 10264: 0027d31b srliw t1,a5,0x2 + 10268: 03d5d613 srli a2,a1,0x3d + 1026c: 0266d5bb divuw a1,a3,t1 + 10270: 00377713 andi a4,a4,3 + 10274: 0266f6bb remuw a3,a3,t1 + 10278: 0035959b slliw a1,a1,0x3 + 1027c: 00b60633 add a2,a2,a1 + 10280: 02c787bb mulw a5,a5,a2 + 10284: 0026969b slliw a3,a3,0x2 + 10288: 00e686bb addw a3,a3,a4 + 1028c: 00d787bb addw a5,a5,a3 + 10290: 02079793 slli a5,a5,0x20 + 10294: 0207d793 srli a5,a5,0x20 + 10298: 06050263 beqz a0,102fc + 1029c: 16803703 ld a4,360(zero) # 168 + 102a0: 00579693 slli a3,a5,0x5 + 102a4: 00d70733 add a4,a4,a3 + 102a8: 00072007 flw ft0,0(a4) + 102ac: 00472087 flw ft1,4(a4) + 102b0: 00872107 flw ft2,8(a4) + 102b4: 00c72187 flw ft3,12(a4) + 102b8: 01072207 flw ft4,16(a4) + 102bc: 01472287 flw ft5,20(a4) + 102c0: 01872307 flw ft6,24(a4) + 102c4: 01c72387 flw ft7,28(a4) + 102c8: 00005e0b .insn 4, 0x5e0b + 102cc: a1c386d3 fle.s a3,ft7,ft8 + 102d0: e00f8753 fmv.x.w a4,ft11 + 102d4: 00279793 slli a5,a5,0x2 + 102d8: e60682e3 beqz a3,1013c + 102dc: 00f807b3 add a5,a6,a5 + 102e0: 0117a023 sw a7,0(a5) + 102e4: 0000070b .insn 4, 0x070b + 102e8: 10802783 lw a5,264(zero) # 108 + 102ec: 0007071b sext.w a4,a4 + 102f0: f6f764e3 bltu a4,a5,10258 + 102f4: 00000513 li a0,0 + 102f8: 00008067 ret + 102fc: d006f7d3 fcvt.s.w fa5,a3 + 10300: d0067353 fcvt.s.w ft6,a2 + 10304: 12002087 flw ft1,288(zero) # 120 + 10308: 00e7f7d3 fadd.s fa5,fa5,fa4 + 1030c: 00e37353 fadd.s ft6,ft6,fa4 + 10310: 12402107 flw ft2,292(zero) # 124 + 10314: 14c02207 flw ft4,332(zero) # 14c + 10318: 13c02187 flw ft3,316(zero) # 13c + 1031c: 5017f7c3 fmadd.s fa5,fa5,ft1,fa0 + 10320: 50237343 fmadd.s ft6,ft6,ft2,fa0 + 10324: 13802087 flw ft1,312(zero) # 138 + 10328: 15002107 flw ft2,336(zero) # 150 + 1032c: 13402007 flw ft0,308(zero) # 134 + 10330: 14402287 flw ft5,324(zero) # 144 + 10334: 1017f0c7 fmsub.s ft1,fa5,ft1,ft2 + 10338: 15402107 flw ft2,340(zero) # 154 + 1033c: 2007f047 fmsub.s ft0,fa5,ft0,ft4 + 10340: 14002207 flw ft4,320(zero) # 140 + 10344: 1037f7c7 fmsub.s fa5,fa5,ft3,ft2 + 10348: 14802107 flw ft2,328(zero) # 148 + 1034c: 085372c3 fmadd.s ft5,ft6,ft5,ft1 + 10350: 00437243 fmadd.s ft4,ft6,ft4,ft0 + 10354: 12c02087 flw ft1,300(zero) # 12c + 10358: 78237343 fmadd.s ft6,ft6,ft2,fa5 + 1035c: 12802007 flw ft0,296(zero) # 128 + 10360: 13002107 flw ft2,304(zero) # 130 + 10364: 211883d3 fmv.s ft7,fa7 + 10368: 210801d3 fmv.s ft3,fa6 + 1036c: f5dff06f j 102c8