From 673f3e5a5c895c465f38357f0b568589c7101ec7 Mon Sep 17 00:00:00 2001 From: Oleksandr Kulkov Date: Sun, 5 Oct 2025 14:38:34 +0200 Subject: [PATCH 01/12] Support any number of threads --- include/builder/parallel_sort.hpp | 4 +++- src/build.cpp | 3 --- tools/build.cpp | 2 +- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/include/builder/parallel_sort.hpp b/include/builder/parallel_sort.hpp index 66668e0..a12848a 100644 --- a/include/builder/parallel_sort.hpp +++ b/include/builder/parallel_sort.hpp @@ -65,7 +65,6 @@ void parallel_sort(std::vector& data, const uint64_t num_threads, Compare com return; } - assert((num_threads & (num_threads - 1)) == 0); const uint64_t data_size = data.size(); const uint64_t chunk_size = (data_size + num_threads - 1) / num_threads; const uint64_t sequential_merge_threshold = data_size / uint64_t(std::log2(num_threads)); @@ -114,6 +113,9 @@ void parallel_sort(std::vector& data, const uint64_t num_threads, Compare com auto merged_begin = output_iterator; auto merged_end = merged_begin + output_size; next_ranges.push_back({merged_begin, merged_end}); + } else { + // Odd range out: carry it forward to the next iteration + next_ranges.push_back(ranges[i]); } } ranges.swap(next_ranges); diff --git a/src/build.cpp b/src/build.cpp index 759326b..15ab344 100644 --- a/src/build.cpp +++ b/src/build.cpp @@ -28,9 +28,6 @@ void dictionary::build(std::string const& filename, if (build_config.l >= constants::max_l) { throw std::runtime_error("l must be < " + std::to_string(constants::max_l)); } - if ((build_config.num_threads & (build_config.num_threads - 1)) != 0) { - throw std::runtime_error("number of threads must be a power of 2"); - } m_k = build_config.k; m_m = build_config.m; diff --git a/tools/build.cpp b/tools/build.cpp index cb38f14..9c9d738 100644 --- a/tools/build.cpp +++ b/tools/build.cpp @@ -21,7 +21,7 @@ int build(int argc, char** argv) { parser.add("seed", "Seed for construction (default is " + std::to_string(constants::seed) + ").", "-s", false); - parser.add("t", "Number of threads (default is 1). Must be a power of 2.", "-t", false); + parser.add("t", "Number of threads (default is 1).", "-t", false); parser.add("l", "A (integer) constant that controls the space/time trade-off of the dictionary. " "A reasonable values lies in [2.." + From 9a6ae807979a9de848863d5f6cd23be76aef388a Mon Sep 17 00:00:00 2001 From: Oleksandr Kulkov Date: Sun, 5 Oct 2025 15:32:58 +0200 Subject: [PATCH 02/12] Fix parallel_sort.hpp --- include/builder/parallel_sort.hpp | 36 +++++++++++++++---------------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/include/builder/parallel_sort.hpp b/include/builder/parallel_sort.hpp index a12848a..c1f531e 100644 --- a/include/builder/parallel_sort.hpp +++ b/include/builder/parallel_sort.hpp @@ -55,17 +55,18 @@ void parallel_merge(Iterator A_begin, Iterator A_end, // working memory, which must be of same size as data. 
*/ template -void parallel_sort(std::vector& data, const uint64_t num_threads, Compare comp) // +void parallel_sort(std::vector& data, uint64_t num_threads, Compare comp) // { std::vector temp_data; temp_data.resize(data.size()); + const uint64_t data_size = data.size(); + num_threads = std::min(num_threads, data_size); if (num_threads <= 1) { std::sort(data.begin(), data.end(), comp); return; } - const uint64_t data_size = data.size(); const uint64_t chunk_size = (data_size + num_threads - 1) / num_threads; const uint64_t sequential_merge_threshold = data_size / uint64_t(std::log2(num_threads)); assert(sequential_merge_threshold > 0); @@ -93,30 +94,29 @@ void parallel_sort(std::vector& data, const uint64_t num_threads, Compare com while (ranges.size() != 1) // { next_ranges.clear(); - for (uint64_t i = 0; i != ranges.size(); i += 2) { + for (uint64_t i = 0; i < ranges.size(); i += 2) { + auto [begin1, end1] = ranges[i]; + + auto input = data.begin(); + auto output = temp_data.begin(); + if (swap) std::swap(input, output); + uint64_t offset = std::distance(input, begin1); + auto output_iterator = output + offset; + assert(offset <= data_size); + uint64_t output_size = end1 - begin1; if (i + 1 < ranges.size()) { - auto [begin1, end1] = ranges[i]; auto [begin2, end2] = ranges[i + 1]; - uint64_t output_size = (end1 - begin1) + (end2 - begin2); - - auto input = data.begin(); - auto output = temp_data.begin(); - if (swap) std::swap(input, output); - uint64_t offset = std::distance(input, begin1); - auto output_iterator = output + offset; - assert(offset <= data_size); + output_size += end2 - begin2; parallel_merge(begin1, end1, begin2, end2, output_iterator, comp, sequential_merge_threshold); assert(std::is_sorted(output_iterator, output_iterator + output_size, comp)); - - auto merged_begin = output_iterator; - auto merged_end = merged_begin + output_size; - next_ranges.push_back({merged_begin, merged_end}); } else { - // Odd range out: carry it forward to the next iteration - next_ranges.push_back(ranges[i]); + std::move(begin1, end1, output_iterator); } + auto merged_begin = output_iterator; + auto merged_end = merged_begin + output_size; + next_ranges.push_back({merged_begin, merged_end}); } ranges.swap(next_ranges); swap = !swap; From 1218e83b7470ded9c2bd1d8dd719e70582c67813 Mon Sep 17 00:00:00 2001 From: Oleksandr Kulkov Date: Sun, 5 Oct 2025 19:03:10 +0200 Subject: [PATCH 03/12] Multithreading fixes --- include/builder/build_sparse_index.hpp | 15 ++++++++------- include/builder/parallel_sort.hpp | 9 +++++---- include/builder/parse_file.hpp | 4 ++-- src/build.cpp | 6 +++--- 4 files changed, 18 insertions(+), 16 deletions(-) diff --git a/include/builder/build_sparse_index.hpp b/include/builder/build_sparse_index.hpp index 5db9084..fe9ccdb 100644 --- a/include/builder/build_sparse_index.hpp +++ b/include/builder/build_sparse_index.hpp @@ -82,9 +82,9 @@ buckets_statistics build_sparse_index(parse_data& data, buckets& } } offsets.push_back(num_super_kmers); - assert(offsets.size() == num_threads + 1); - std::vector threads_buckets_stats(num_threads); + std::vector threads_buckets_stats; + threads_buckets_stats.reserve(num_threads); auto exe = [&](const uint64_t thread_id) { assert(thread_id + 1 < offsets.size()); @@ -115,11 +115,12 @@ buckets_statistics build_sparse_index(parse_data& data, buckets& } }; - std::vector threads(num_threads); - for (uint64_t thread_id = 0; thread_id != num_threads; ++thread_id) { - threads_buckets_stats[thread_id] = - buckets_statistics(num_buckets, num_kmers, 
num_minimizer_positions); - threads[thread_id] = std::thread(exe, thread_id); + std::vector threads; + threads.reserve(num_threads); + assert(offsets.size() <= num_threads + 1); + for (uint64_t thread_id = 0; thread_id + 1 < size(offsets); ++thread_id) { + threads_buckets_stats.emplace_back(num_buckets, num_kmers, num_minimizer_positions); + threads.emplace_back(exe, thread_id); } for (auto& t : threads) { if (t.joinable()) t.join(); diff --git a/include/builder/parallel_sort.hpp b/include/builder/parallel_sort.hpp index c1f531e..9b6c63c 100644 --- a/include/builder/parallel_sort.hpp +++ b/include/builder/parallel_sort.hpp @@ -75,12 +75,13 @@ void parallel_sort(std::vector& data, uint64_t num_threads, Compare comp) // threads.reserve(num_threads); using iterator_t = typename std::vector::iterator; - std::vector> ranges(num_threads); + std::vector> ranges; + ranges.reserve(num_threads); - for (uint64_t i = 0; i != num_threads; ++i) { + for (uint64_t i = 0; i * chunk_size < data_size; ++i) { uint64_t begin = i * chunk_size; - uint64_t end = (i == num_threads - 1) ? data_size : begin + chunk_size; - ranges[i] = {data.begin() + begin, data.begin() + end}; + uint64_t end = std::min(data_size, begin + chunk_size); + ranges.emplace_back(data.begin() + begin, data.begin() + end); threads.emplace_back( [&, begin, end]() { std::sort(data.begin() + begin, data.begin() + end, comp); }); } diff --git a/include/builder/parse_file.hpp b/include/builder/parse_file.hpp index 1e71bb0..f269788 100644 --- a/include/builder/parse_file.hpp +++ b/include/builder/parse_file.hpp @@ -214,7 +214,7 @@ void parse_file(std::istream& is, parse_data& data, std::vector threads; threads.reserve(num_threads); - for (uint64_t t = 0; t != num_threads; ++t) // + for (uint64_t t = 0; t * num_sequences_per_thread < num_sequences; ++t) // { threads.emplace_back([&, t] { std::vector buffer; @@ -249,7 +249,7 @@ void parse_file(std::istream& is, parse_data& data, minimizer_iterator minimizer_it(k, m, hasher); minimizer_iterator_rc minimizer_it_rc(k, m, hasher); - for (uint64_t i = index_begin; i != index_end; ++i) // + for (uint64_t i = index_begin; i < index_end; ++i) // { const uint64_t begin = data.pieces[i]; const uint64_t end = data.pieces[i + 1]; diff --git a/src/build.cpp b/src/build.cpp index 15ab344..117f2d7 100644 --- a/src/build.cpp +++ b/src/build.cpp @@ -123,11 +123,11 @@ void dictionary::build(std::string const& filename, input.read(reinterpret_cast(buffer.data()), buffer.size() * sizeof(minimizer_tuple)); const uint64_t chunk_size = (n + num_threads - 1) / num_threads; - for (uint64_t t = 0; t != num_threads; ++t) { + for (uint64_t t = 0; t * chunk_size < n; ++t) { uint64_t begin = t * chunk_size; - uint64_t end = (t == num_threads - 1) ? 
n : begin + chunk_size; + uint64_t end = std::min(n, begin + chunk_size); threads.emplace_back([begin, end, &buffer, &f]() { - for (uint64_t i = begin; i != end; ++i) { + for (uint64_t i = begin; i < end; ++i) { buffer[i].minimizer = f.lookup(buffer[i].minimizer); } }); From 4b3781d3a095e384f2357d2458d7aeff28ba52fe Mon Sep 17 00:00:00 2001 From: Oleksandr Kulkov Date: Mon, 6 Oct 2025 00:11:39 +0200 Subject: [PATCH 04/12] Update pthash --- external/pthash | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/external/pthash b/external/pthash index 725846c..c6f11b1 160000 --- a/external/pthash +++ b/external/pthash @@ -1 +1 @@ -Subproject commit 725846c18709efb4d2b55170c70ea174d73d62dc +Subproject commit c6f11b15816f9c6fb0896ed1936495d53fa24a7b From 6457e5d45e3b2120b898d3a5e31586ffe4211d71 Mon Sep 17 00:00:00 2001 From: Oleksandr Kulkov Date: Mon, 6 Oct 2025 00:17:38 +0200 Subject: [PATCH 05/12] update pthash --- external/pthash | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/external/pthash b/external/pthash index c6f11b1..53b682e 160000 --- a/external/pthash +++ b/external/pthash @@ -1 +1 @@ -Subproject commit c6f11b15816f9c6fb0896ed1936495d53fa24a7b +Subproject commit 53b682e1396936040fb15489b6307a3a94a27093 From 7f4f9a580c739c78a4403394f89c2b6289182e8a Mon Sep 17 00:00:00 2001 From: Oleksandr Kulkov Date: Mon, 6 Oct 2025 17:26:33 +0200 Subject: [PATCH 06/12] Try safe-guarding offsets_builder.set with mutex --- include/builder/build_sparse_index.hpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/include/builder/build_sparse_index.hpp b/include/builder/build_sparse_index.hpp index fe9ccdb..17beff9 100644 --- a/include/builder/build_sparse_index.hpp +++ b/include/builder/build_sparse_index.hpp @@ -1,5 +1,6 @@ #pragma once +#include #include "include/buckets_statistics.hpp" namespace sshash { @@ -86,6 +87,11 @@ buckets_statistics build_sparse_index(parse_data& data, buckets& std::vector threads_buckets_stats; threads_buckets_stats.reserve(num_threads); + // Mutex to protect concurrent writes to offsets_builder + // Multiple threads can write to the same bucket positions, and compact_vector + // packs values into 64-bit words, so nearby writes can race on the same word + std::mutex offsets_mutex; + auto exe = [&](const uint64_t thread_id) { assert(thread_id + 1 < offsets.size()); const uint64_t offset_begin = offsets[thread_id]; @@ -106,6 +112,9 @@ buckets_statistics build_sparse_index(parse_data& data, buckets& uint64_t prev_pos_in_seq = constants::invalid_uint64; for (auto mt : bucket) { if (mt.pos_in_seq != prev_pos_in_seq) { + // Protect compact_vector writes - multiple threads can write to + // positions that share the same 64-bit word + std::lock_guard lock(offsets_mutex); offsets_builder.set(begin + pos++, mt.pos_in_seq); prev_pos_in_seq = mt.pos_in_seq; } From 20d7d81639e37c24df4f1f5d4c5fc91e3f44ef36 Mon Sep 17 00:00:00 2001 From: Oleksandr Kulkov Date: Mon, 6 Oct 2025 18:57:33 +0200 Subject: [PATCH 07/12] fix kmer_t processing for uint_kmer_bits > 64 --- include/builder/parse_file.hpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/include/builder/parse_file.hpp b/include/builder/parse_file.hpp index f269788..ef13e01 100644 --- a/include/builder/parse_file.hpp +++ b/include/builder/parse_file.hpp @@ -173,7 +173,10 @@ void parse_file(std::istream& is, parse_data& data, /* Push a final sentinel (dummy) value to avoid bounds' checking in kmer_iterator::fill_buff(). 
*/ - bvb_strings.append_bits(0, kmer_t::uint_kmer_bits); + for (int i = 0; i < kmer_t::uint_kmer_bits / 64; i++) { + bvb_strings.append_bits(0, 64); + } + bvb_strings.append_bits(0, kmer_t::uint_kmer_bits % 64); bvb_strings.build(data.strings); From c35f3040e40dd6805885605883bed43f7cb558ca Mon Sep 17 00:00:00 2001 From: Oleksandr Kulkov Date: Mon, 6 Oct 2025 18:57:33 +0200 Subject: [PATCH 08/12] update pthash --- external/pthash | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/external/pthash b/external/pthash index 53b682e..521cc01 160000 --- a/external/pthash +++ b/external/pthash @@ -1 +1 @@ -Subproject commit 53b682e1396936040fb15489b6307a3a94a27093 +Subproject commit 521cc018932ea73d63b84e9ac066fe94020edfd2 From 06557f6b85fd8715613137f45d3932ef1c584ebd Mon Sep 17 00:00:00 2001 From: Oleksandr Kulkov Date: Tue, 7 Oct 2025 18:17:23 +0200 Subject: [PATCH 09/12] Single-threaded build_sparse_index --- include/builder/build_sparse_index.hpp | 90 ++++++-------------------- 1 file changed, 18 insertions(+), 72 deletions(-) diff --git a/include/builder/build_sparse_index.hpp b/include/builder/build_sparse_index.hpp index 17beff9..32f32f6 100644 --- a/include/builder/build_sparse_index.hpp +++ b/include/builder/build_sparse_index.hpp @@ -29,17 +29,12 @@ struct bucket_size_iterator { template buckets_statistics build_sparse_index(parse_data& data, buckets& m_buckets, - build_configuration const& build_config) // + build_configuration const& /*build_config*/) // { const uint64_t num_kmers = data.num_kmers; const uint64_t num_minimizer_positions = data.minimizers.num_minimizer_positions(); - const uint64_t num_super_kmers = data.minimizers.num_super_kmers(); const uint64_t num_buckets = data.minimizers.num_minimizers(); - const uint64_t num_threads = build_config.num_threads; - bits::compact_vector::builder offsets_builder; - offsets_builder.resize(num_minimizer_positions, - std::ceil(std::log2(data.strings.num_bits() / kmer_t::bits_per_char))); std::cout << "bits_per_offset = ceil(log2(" << data.strings.num_bits() / kmer_t::bits_per_char << ")) = " << std::ceil(std::log2(data.strings.num_bits() / kmer_t::bits_per_char)) @@ -67,74 +62,25 @@ buckets_statistics build_sparse_index(parse_data& data, buckets& buckets_statistics buckets_stats(num_buckets, num_kmers, num_minimizer_positions); timer.start(); - const uint64_t block_size = (num_super_kmers + num_threads - 1) / num_threads; - std::vector offsets; - offsets.reserve(num_threads + 1); - for (uint64_t offset = -1; offset != num_super_kmers;) { - offsets.push_back(offset + 1); - offset = std::min((offset + 1) + block_size, num_super_kmers); - minimizer_tuple const* b = begin + offset; - uint64_t curr_minimizer = (*b).minimizer; - while (b + 1 < end) { // adjust offset - uint64_t next_minimizer = (*(b + 1)).minimizer; - if (curr_minimizer != next_minimizer) break; - b += 1; - offset += 1; + + bits::compact_vector::builder offsets_builder; + offsets_builder.resize(num_minimizer_positions, + std::ceil(std::log2(data.strings.num_bits() / kmer_t::bits_per_char))); + uint64_t prev_minimizer = constants::invalid_uint64, prev_pos_in_seq = constants::invalid_uint64, bucket_size = 0; + for (auto mt : input) { + if (mt.minimizer != prev_minimizer) { + auto [bucket_begin, bucket_end] = m_buckets.locate_bucket(mt.minimizer); + bucket_size = bucket_end - bucket_begin; + buckets_stats.add_bucket_size(bucket_size); + prev_minimizer = mt.minimizer; + prev_pos_in_seq = constants::invalid_uint64; } - } - offsets.push_back(num_super_kmers); - - 
std::vector threads_buckets_stats; - threads_buckets_stats.reserve(num_threads); - - // Mutex to protect concurrent writes to offsets_builder - // Multiple threads can write to the same bucket positions, and compact_vector - // packs values into 64-bit words, so nearby writes can race on the same word - std::mutex offsets_mutex; - - auto exe = [&](const uint64_t thread_id) { - assert(thread_id + 1 < offsets.size()); - const uint64_t offset_begin = offsets[thread_id]; - const uint64_t offset_end = offsets[thread_id + 1]; - auto& tbs = threads_buckets_stats[thread_id]; - for (minimizers_tuples_iterator it(begin + offset_begin, begin + offset_end); // - it.has_next(); // - it.next()) // - { - const uint64_t bucket_id = it.minimizer(); - const auto [begin, end] = m_buckets.locate_bucket(bucket_id); - assert(end > begin); - const uint64_t bucket_size = end - begin; - assert(bucket_size == it.bucket().size()); - tbs.add_bucket_size(bucket_size); - uint64_t pos = 0; - auto bucket = it.bucket(); - uint64_t prev_pos_in_seq = constants::invalid_uint64; - for (auto mt : bucket) { - if (mt.pos_in_seq != prev_pos_in_seq) { - // Protect compact_vector writes - multiple threads can write to - // positions that share the same 64-bit word - std::lock_guard lock(offsets_mutex); - offsets_builder.set(begin + pos++, mt.pos_in_seq); - prev_pos_in_seq = mt.pos_in_seq; - } - tbs.add_num_kmers_in_super_kmer(bucket_size, mt.num_kmers_in_super_kmer); - } - assert(pos == bucket_size); + buckets_stats.add_num_kmers_in_super_kmer(bucket_size, mt.num_kmers_in_super_kmer); + if (mt.pos_in_seq != prev_pos_in_seq) { + offsets_builder.push_back(mt.pos_in_seq); + prev_pos_in_seq = mt.pos_in_seq; } - }; - - std::vector threads; - threads.reserve(num_threads); - assert(offsets.size() <= num_threads + 1); - for (uint64_t thread_id = 0; thread_id + 1 < size(offsets); ++thread_id) { - threads_buckets_stats.emplace_back(num_buckets, num_kmers, num_minimizer_positions); - threads.emplace_back(exe, thread_id); - } - for (auto& t : threads) { - if (t.joinable()) t.join(); } - for (auto const& tbs : threads_buckets_stats) buckets_stats += tbs; input.close(); timer.stop(); @@ -154,4 +100,4 @@ buckets_statistics build_sparse_index(parse_data& data, buckets& return buckets_stats; } -} // namespace sshash \ No newline at end of file +} // namespace sshash From 64d88946017053d466cc4295d06b8fcf8bf5eb66 Mon Sep 17 00:00:00 2001 From: Mikhail Karasikov Date: Wed, 14 Jan 2026 15:56:18 +0100 Subject: [PATCH 10/12] make compiling executables optional --- CMakeLists.txt | 42 +++++++++++++++++++++++------------------- 1 file changed, 23 insertions(+), 19 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c8bed87..5384ce0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -48,11 +48,13 @@ else() set(CONDA_BUILD FALSE) endif() +option(SSHASH_BUILD_EXECUTABLES "Build sshash executables" ON) MESSAGE(STATUS "Build type: ${CMAKE_BUILD_TYPE}") MESSAGE(STATUS "Conda build: ${CONDA_BUILD}") MESSAGE(STATUS "Installation prefix: ${CMAKE_INSTALL_PREFIX}") MESSAGE(STATUS "Compiling for processor: ${CMAKE_SYSTEM_PROCESSOR}") MESSAGE(STATUS "Compiling with flags:${CMAKE_CXX_FLAGS}") +MESSAGE(STATUS "Build executables: ${SSHASH_BUILD_EXECUTABLES}") set(Z_LIB_SOURCES external/gz/zip_stream.cpp @@ -85,27 +87,29 @@ add_library(sshash_static STATIC target_include_directories(sshash_static PUBLIC ${SSHASH_INCLUDE_DIRS}) -add_executable(sshash tools/sshash.cpp) -target_include_directories(sshash PUBLIC ${SSHASH_INCLUDE_DIRS}) 
-target_link_libraries(sshash - z -) +if(SSHASH_BUILD_EXECUTABLES) + add_executable(sshash tools/sshash.cpp) + target_include_directories(sshash PUBLIC ${SSHASH_INCLUDE_DIRS}) + target_link_libraries(sshash + z + ) -# tests: + # tests: -add_executable(test_alphabet test/test_alphabet.cpp) -target_link_libraries(test_alphabet - sshash_static -) - -add_executable(check test/check.cpp) -target_link_libraries(check - sshash_static - z -) + add_executable(test_alphabet test/test_alphabet.cpp) + target_link_libraries(test_alphabet + sshash_static + ) -if (CONDA_BUILD) - install(TARGETS sshash - RUNTIME DESTINATION bin + add_executable(check test/check.cpp) + target_link_libraries(check + sshash_static + z ) + + if (CONDA_BUILD) + install(TARGETS sshash + RUNTIME DESTINATION bin + ) + endif() endif() From 88153605d5cbfffa90eb5a12c5ad2ddd896375fe Mon Sep 17 00:00:00 2001 From: Oleksandr Kulkov Date: Mon, 16 Feb 2026 14:02:35 +0100 Subject: [PATCH 11/12] Wrap cityhash in namespace and add accessor methods for metagraph compatibility Changes for metagraph integration: 1. Wrap cityhash in namespace to avoid typedef pollution - cityhash.hpp and cityhash.cpp: Wrapped in 'cityhash' namespace - Prevents conflicts with rollinghashcpp uint64 typedef - hash_util.hpp: Use cityhash::CityHash128WithSeed instead of static CityMurmur - tools/query.cpp: Changed uint64 to uint64_t (standard type) 2. Add public accessor methods to dictionary - Added kmer_type typedef for easier template parameter extraction - Added strings() accessor returning m_spss.strings bit vector - Added strings_offsets() accessor returning m_spss.strings_offsets - Moved strings_offsets() next to string_offsets() for better organization 3. Build system changes - CMakeLists.txt: Added cityhash.cpp to SSHASH_SOURCES - CMakeLists.txt: Link sshash executable against sshash_static (includes cityhash) 4. 
Update pthash submodule - Now tracking ratschlab/pthash master with compute_empirical_entropy fix - .gitmodules: Updated URL to point to ratschlab/pthash --- .gitmodules | 2 +- CMakeLists.txt | 6 ++++++ external/cityhash/cityhash.cpp | 6 +++++- external/cityhash/cityhash.hpp | 4 ++++ external/pthash | 2 +- include/dictionary.hpp | 6 ++++++ include/hash_util.hpp | 6 +++--- tools/query.cpp | 2 +- 8 files changed, 27 insertions(+), 7 deletions(-) diff --git a/.gitmodules b/.gitmodules index a45b147..9f3579f 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,3 @@ [submodule "external/pthash"] path = external/pthash - url = https://github.com/jermp/pthash.git + url = https://github.com/ratschlab/pthash.git diff --git a/CMakeLists.txt b/CMakeLists.txt index ed43720..f917ba9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -60,6 +60,10 @@ set(Z_LIB_SOURCES external/gz/zip_stream.cpp ) +set(CITYHASH_SOURCES + external/cityhash/cityhash.cpp +) + set(SSHASH_SOURCES src/build.cpp src/dictionary.cpp @@ -81,6 +85,7 @@ set(SSHASH_INCLUDE_DIRS # Create a static lib add_library(sshash_static STATIC ${Z_LIB_SOURCES} + ${CITYHASH_SOURCES} ${SSHASH_SOURCES} ) @@ -90,6 +95,7 @@ if(SSHASH_BUILD_EXECUTABLES) add_executable(sshash tools/sshash.cpp) target_include_directories(sshash PUBLIC ${SSHASH_INCLUDE_DIRS}) target_link_libraries(sshash + sshash_static z ) diff --git a/external/cityhash/cityhash.cpp b/external/cityhash/cityhash.cpp index 50cafc5..7e81ded 100644 --- a/external/cityhash/cityhash.cpp +++ b/external/cityhash/cityhash.cpp @@ -34,6 +34,8 @@ #include #include // for memcpy and memset +namespace cityhash { + using namespace std; static uint64 UNALIGNED_LOAD64(const char* p) { @@ -442,4 +444,6 @@ uint128 CityHashCrc128(const char* s, size_t len) { } } -#endif \ No newline at end of file +#endif + +} // namespace cityhash \ No newline at end of file diff --git a/external/cityhash/cityhash.hpp b/external/cityhash/cityhash.hpp index 23fb877..801f919 100644 --- a/external/cityhash/cityhash.hpp +++ b/external/cityhash/cityhash.hpp @@ -48,6 +48,8 @@ #include // for size_t. #include +namespace cityhash { + // Microsoft Visual Studio may not have stdint.h. #if defined(_MSC_VER) && (_MSC_VER < 1600) typedef unsigned char uint8_t; @@ -112,4 +114,6 @@ void CityHashCrc256(const char* s, size_t len, uint64* result); #endif // __SSE4_2__ +} // namespace cityhash + #endif // CITY_HASH_H_ \ No newline at end of file diff --git a/external/pthash b/external/pthash index cdeee6e..ee5ced4 160000 --- a/external/pthash +++ b/external/pthash @@ -1 +1 @@ -Subproject commit cdeee6ef1cc97d7cdb022e5d2d0013981335ee3e +Subproject commit ee5ced44af81b8a5b01762252363b2b9d9287c6e diff --git a/include/dictionary.hpp b/include/dictionary.hpp index d0fa3db..a30b8c4 100644 --- a/include/dictionary.hpp +++ b/include/dictionary.hpp @@ -68,6 +68,9 @@ struct dictionary // /* Return the string of the kmer whose id is kmer_id. */ void access(uint64_t kmer_id, char* string_kmer) const; + /* Accessor for internal bit vector */ + bits::bit_vector const& strings() const { return m_spss.strings; } + /* Membership queries. 
*/ bool is_member(char const* string_kmer, bool check_reverse_complement = true) const; bool is_member(Kmer uint_kmer, bool check_reverse_complement = true) const; @@ -104,6 +107,9 @@ struct dictionary // return m_spss.string_offsets(string_id); } + /* Accessor for internal offsets structure */ + Offsets const& strings_offsets() const { return m_spss.strings_offsets; } + iterator at_string_id(const uint64_t string_id) const { assert(string_id < num_strings()); auto [begin, end] = string_offsets(string_id); diff --git a/include/hash_util.hpp b/include/hash_util.hpp index d6b4fbc..dddb0c2 100644 --- a/include/hash_util.hpp +++ b/include/hash_util.hpp @@ -1,7 +1,7 @@ #pragma once #include "external/pthash/include/pthash.hpp" -#include "external/cityhash/cityhash.cpp" +#include "external/cityhash/cityhash.hpp" #include "constants.hpp" namespace sshash { @@ -10,7 +10,7 @@ struct minimizers_city_hasher_128 { typedef pthash::hash128 hash_type; static inline pthash::hash128 hash(uint64_t const minimizer, uint64_t seed) { - auto ret = CityMurmur(reinterpret_cast(&minimizer), // + auto ret = cityhash::CityHash128WithSeed(reinterpret_cast(&minimizer), // sizeof(minimizer), {seed, ~seed}); return {ret.first, ret.second}; } @@ -60,7 +60,7 @@ struct kmers_city_hasher_128 { typedef pthash::hash128 hash_type; static inline pthash::hash128 hash(Kmer const x, uint64_t seed) { - auto ret = CityMurmur(reinterpret_cast(&(x.bits)), // + auto ret = cityhash::CityHash128WithSeed(reinterpret_cast(&(x.bits)), // sizeof(x.bits), {seed, ~seed}); return {ret.first, ret.second}; } diff --git a/tools/query.cpp b/tools/query.cpp index 67eec13..0ae64ff 100644 --- a/tools/query.cpp +++ b/tools/query.cpp @@ -41,7 +41,7 @@ int query(int argc, char** argv) { query_stats.add("num_invalid_kmers", report.num_invalid_kmers); query_stats.add("num_searches", report.num_searches); query_stats.add("num_extensions", report.num_extensions); - query_stats.add("elapsed_millisec", uint64(t.elapsed())); + query_stats.add("elapsed_millisec", uint64_t(t.elapsed())); std::cout << "==== query report:\n"; std::cout << "num_kmers = " << report.num_kmers << std::endl; From 964d891b0d95838ef937e7e8c6e33bcc1496931f Mon Sep 17 00:00:00 2001 From: Oleksandr Kulkov Date: Mon, 16 Feb 2026 20:14:33 +0100 Subject: [PATCH 12/12] Fix sparse index bit budget calculation and verbose flag handling Fix two critical issues discovered during metagraph integration: 1. Sparse index encoding bug: - The sparse index encoder uses list_id in control codewords for buckets of size 2-64, but was allocating bits based on offset size (num_bits_per_offset + 1) instead of list_id range. - list_id counts buckets within each size category and can reach the total number of minimizers in worst case (all same size). - Fixed by tracking max_buckets_per_size during statistics collection and using: max(num_bits_per_offset + 1, 2 + 6 + ceil(log2(max_buckets_per_size + 1))) - This provides tight bound with minimal overhead (~2-3% worst case). 2. Verbose flag handling: - build_stats.print() was called unconditionally in dictionary_builder.hpp, ignoring build_config.verbose flag. - Fixed by making print() conditional: if (build_config.verbose) - This respects the library's own configuration mechanism. 
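Worked example for the bit budget in (1), with purely illustrative numbers:
   num_bits_per_offset = 40, constants::min_l = 6, max_buckets_per_size = 1,000,000
   => max(40 + 1, 2 + 6 + ceil(log2(1,000,001))) = max(41, 28) = 41 bits per codeword.
   In this hypothetical case the offset term still dominates, so the tighter
   bound costs nothing extra; additional bits are paid only when the list_id
   term grows past num_bits_per_offset + 1.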
Implementation: - Added m_max_buckets_per_size member to buckets_statistics class - Modified add_bucket_size() to track maximum incrementally - Zero time overhead (uses std::max with existing data) - Clean encapsulation within statistics class --- include/buckets_statistics.hpp | 18 +++-- .../builder/build_sparse_and_skew_index.cpp | 65 ++++++++++++------- include/builder/dictionary_builder.hpp | 2 +- 3 files changed, 57 insertions(+), 28 deletions(-) diff --git a/include/buckets_statistics.hpp b/include/buckets_statistics.hpp index 05e7a3e..da77124 100644 --- a/include/buckets_statistics.hpp +++ b/include/buckets_statistics.hpp @@ -11,14 +11,16 @@ struct buckets_statistics { , m_num_kmers(0) , m_num_minimizer_positions(0) , m_max_num_kmers_in_super_kmer(0) - , m_max_bucket_size(0) {} + , m_max_bucket_size(0) + , m_max_buckets_per_size(0) {} buckets_statistics(uint64_t num_buckets, uint64_t num_kmers, uint64_t num_minimizer_positions) : m_num_buckets(num_buckets) , m_num_kmers(num_kmers) , m_num_minimizer_positions(num_minimizer_positions) , m_max_num_kmers_in_super_kmer(0) - , m_max_bucket_size(0) // + , m_max_bucket_size(0) + , m_max_buckets_per_size(0) // { m_bucket_sizes.resize(MAX_BUCKET_SIZE + 1, 0); m_total_kmers.resize(MAX_BUCKET_SIZE + 1, 0); @@ -26,8 +28,11 @@ struct buckets_statistics { } void add_bucket_size(uint64_t bucket_size) { - if (bucket_size < MAX_BUCKET_SIZE + 1) { m_bucket_sizes[bucket_size] += 1; } - if (bucket_size > m_max_bucket_size) { m_max_bucket_size = bucket_size; } + if (bucket_size < MAX_BUCKET_SIZE + 1) { + m_bucket_sizes[bucket_size] += 1; + m_max_buckets_per_size = std::max(m_max_buckets_per_size, m_bucket_sizes[bucket_size]); + } + m_max_bucket_size = std::max(m_max_bucket_size, bucket_size); } void add_num_kmers_in_super_kmer(uint64_t bucket_size, @@ -49,6 +54,7 @@ struct buckets_statistics { uint64_t num_minimizer_positions() const { return m_num_minimizer_positions; } uint64_t max_num_kmers_in_super_kmer() const { return m_max_num_kmers_in_super_kmer; } uint64_t max_bucket_size() const { return m_max_bucket_size; } + uint64_t max_buckets_per_size() const { return m_max_buckets_per_size; } void print_full() const { std::cout << "=== bucket statistics (full) === \n"; @@ -138,6 +144,9 @@ struct buckets_statistics { if (rhs.max_bucket_size() > m_max_bucket_size) { m_max_bucket_size = rhs.max_bucket_size(); } + if (rhs.max_buckets_per_size() > m_max_buckets_per_size) { + m_max_buckets_per_size = rhs.max_buckets_per_size(); + } assert(m_bucket_sizes.size() == rhs.m_bucket_sizes.size()); for (uint64_t i = 0; i != m_bucket_sizes.size(); ++i) { @@ -160,6 +169,7 @@ struct buckets_statistics { uint64_t m_max_num_kmers_in_super_kmer; uint64_t m_max_bucket_size; + uint64_t m_max_buckets_per_size; std::vector m_bucket_sizes; std::vector m_total_kmers; diff --git a/include/builder/build_sparse_and_skew_index.cpp b/include/builder/build_sparse_and_skew_index.cpp index 9e9f1ce..26ee343 100644 --- a/include/builder/build_sparse_and_skew_index.cpp +++ b/include/builder/build_sparse_and_skew_index.cpp @@ -15,13 +15,6 @@ void dictionary_builder::build_sparse_and_skew_index( const uint64_t min_size = 1ULL << constants::min_l; const uint64_t num_bits_per_offset = strings_offsets_builder.num_bits_per_offset(); - if (build_config.verbose) { - std::cout << "num_bits_per_offset = " << num_bits_per_offset << std::endl; - } - - bits::compact_vector::builder control_codewords_builder; - control_codewords_builder.resize(num_minimizers, num_bits_per_offset + 1); - mm::file_source 
input(minimizers.get_minimizers_filename(), mm::advice::sequential); @@ -33,11 +26,10 @@ void dictionary_builder::build_sparse_and_skew_index( uint64_t num_minimizer_positions_of_buckets_larger_than_1 = 0; uint64_t num_minimizer_positions_of_buckets_in_skew_index = 0; + // First pass: collect bucket statistics to compute tighter bound for (minimizers_tuples_iterator it(input.data(), input.data() + input.size()); // it.has_next(); it.next()) // { - const uint64_t bucket_id = it.minimizer(); - assert(bucket_id < num_minimizers); auto bucket = it.bucket(); const uint64_t bucket_size = bucket.size(); buckets_stats.add_bucket_size(bucket_size); @@ -53,24 +45,31 @@ void dictionary_builder::build_sparse_and_skew_index( num_super_kmers_in_buckets_larger_than_1 += bucket.num_super_kmers(); } - uint64_t prev_pos_in_seq = constants::invalid_uint64; for (auto mt : bucket) { - if (bucket_size == 1 and mt.pos_in_seq != prev_pos_in_seq) { - /* - For minimizers occurring once, store a (log(N)+1)-bit - code, as follows: |offset|0|, i.e., the LSB is 0. - */ - uint64_t code = mt.pos_in_seq << 1; // first LS bit encodes status code: 0 - assert(code < (uint64_t(1) << (num_bits_per_offset + 1))); - control_codewords_builder.set(bucket_id, code); - prev_pos_in_seq = mt.pos_in_seq; - } buckets_stats.add_num_kmers_in_super_kmer(bucket_size, mt.num_kmers_in_super_kmer); } } assert(buckets_stats.num_buckets() == num_minimizers); + // Calculate bits needed for control codewords encoding: + // Encoding format: ((list_id << min_l) | (bucket_size - 2)) << 2 | status_code + // We need: 2 bits (status) + min_l bits (bucket_size) + bits for list_id + // list_id is bounded by the maximum number of buckets sharing the same size + const uint64_t bits_for_list_id = std::ceil(std::log2(buckets_stats.max_buckets_per_size() + 1)); + const uint64_t num_bits_for_control = std::max(num_bits_per_offset + 1, + 2 + constants::min_l + bits_for_list_id); + + if (build_config.verbose) { + std::cout << "num_bits_per_offset = " << num_bits_per_offset << std::endl; + std::cout << "max_list_id = " << buckets_stats.max_buckets_per_size() << std::endl; + std::cout << "bits_for_list_id = " << bits_for_list_id << std::endl; + std::cout << "num_bits_for_control = " << num_bits_for_control << std::endl; + } + + bits::compact_vector::builder control_codewords_builder; + control_codewords_builder.resize(num_minimizers, num_bits_for_control); + strings_offsets_builder.build(d.m_spss.strings_offsets); strings_builder.build(d.m_spss.strings); @@ -100,11 +99,31 @@ void dictionary_builder::build_sparse_and_skew_index( std::vector tuples; // backed memory tuples.reserve(num_super_kmers_in_buckets_larger_than_1); + // Second pass: collect buckets > 1 for sorting AND handle size-1 buckets for (minimizers_tuples_iterator it(input.data(), input.data() + input.size()); // it.has_next(); it.next()) // { + const uint64_t bucket_id = it.minimizer(); auto bucket = it.bucket(); - if (bucket.size() > 1) { + const uint64_t bucket_size = bucket.size(); + + if (bucket_size == 1) { + // Handle size-1 buckets: encode directly into control codewords + uint64_t prev_pos_in_seq = constants::invalid_uint64; + for (auto mt : bucket) { + if (mt.pos_in_seq != prev_pos_in_seq) { + /* + For minimizers occurring once, store a (log(N)+1)-bit + code, as follows: |offset|0|, i.e., the LSB is 0. 
+ */ + uint64_t code = mt.pos_in_seq << 1; // first LS bit encodes status code: 0 + assert(code < (uint64_t(1) << num_bits_for_control)); + control_codewords_builder.set(bucket_id, code); + prev_pos_in_seq = mt.pos_in_seq; + } + } + } else { + // Collect buckets > 1 for later processing minimizer_tuple const* begin = tuples.data() + tuples.size(); std::copy(bucket.begin_ptr(), bucket.end_ptr(), std::back_inserter(tuples)); minimizer_tuple const* end = tuples.data() + tuples.size(); @@ -187,7 +206,7 @@ void dictionary_builder::build_sparse_and_skew_index( if (prev_pos_in_seq == constants::invalid_uint64) { // only once uint64_t p = (list_id << constants::min_l) | (curr_bucket_size - 2); uint64_t code = (p << 2) | 1; // first two LS bits encode status code: 01 - assert(code < (uint64_t(1) << (num_bits_per_offset + 1))); + assert(code < (uint64_t(1) << num_bits_for_control)); control_codewords_builder.set(mt.minimizer, code); } if (mt.pos_in_seq != prev_pos_in_seq) { @@ -204,7 +223,7 @@ void dictionary_builder::build_sparse_and_skew_index( assert(partition_id < 8); uint64_t p = (heavy_load_buckets_size << 3) | partition_id; uint64_t code = (p << 2) | 3; // first two LS bits encode status code: 11 - assert(code < (uint64_t(1) << (num_bits_per_offset + 1))); + assert(code < (uint64_t(1) << num_bits_for_control)); control_codewords_builder.set(mt.minimizer, code); } if (mt.pos_in_seq != prev_pos_in_seq) { diff --git a/include/builder/dictionary_builder.hpp b/include/builder/dictionary_builder.hpp index 65703c0..efc2230 100644 --- a/include/builder/dictionary_builder.hpp +++ b/include/builder/dictionary_builder.hpp @@ -75,7 +75,7 @@ struct dictionary_builder // build_stats.add("index_size_in_bytes", (d.num_bits() + 7) / 8); build_stats.add("num_kmers", d.num_kmers()); - build_stats.print(); + if (build_config.verbose) build_stats.print(); } build_configuration build_config;
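Usage sketch (illustrative only): with the power-of-two restriction removed in
patches 01-02, parallel_sort accepts any thread count. The snippet below is a
hypothetical example and not part of any patch above; it assumes parallel_sort
is declared in namespace sshash in include/builder/parallel_sort.hpp and that
the program is compiled with -pthread. The data and the comparator are invented
for illustration.

    #include <cstdint>
    #include <functional>
    #include <random>
    #include <vector>

    #include "include/builder/parallel_sort.hpp"

    int main() {
        // Invented test data: one million pseudo-random 64-bit keys.
        std::vector<uint64_t> keys(1'000'000);
        std::mt19937_64 rng(42);
        for (auto& k : keys) k = rng();

        // Six threads -- not a power of two -- now work: each thread sorts one
        // chunk, chunks are merged pairwise, and an odd leftover range is
        // carried forward to the next merge round.
        sshash::parallel_sort(keys, 6, std::less<uint64_t>());

        return 0;
    }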