From bf1ee85ae46691a67d20a7305d498c041c4e1138 Mon Sep 17 00:00:00 2001 From: Oleksandr Kulkov Date: Mon, 16 Feb 2026 20:37:22 +0100 Subject: [PATCH 1/2] Fix sparse index bug and improve library integration This commit fixes a bug and adds improvements for better integration as a library dependency: 1. Fix sparse index bit budget calculation: - The sparse index encoder uses list_id in control codewords for buckets of size 2-64, but was allocating bits based only on num_bits_per_offset + 1 (offset size). - list_id is a counter within each bucket size category that can reach the total number of minimizers in worst case (when all minimizers fall into buckets of the same size). - This mismatch causes assertion failures on repetitive sequences. - Fixed by tracking max_buckets_per_size and ensuring sufficient bits: max(num_bits_per_offset + 1, 2 + 6 + ceil(log2(max_buckets_per_size + 1))) - No behavior change on typical datasets: if the previous estimate was sufficient, the new one will be smaller and use the old value. For datasets where the assert failed, this fix enables support. - Implementation uses incremental tracking in buckets_statistics via std::max (zero time overhead, no additional pass). 2. Make verbose output respect build_config flag: - build_stats.print() was called unconditionally in dictionary_builder.hpp. - Now respects build_config.verbose flag for better control. 3. Namespace cityhash to avoid typedef conflicts: - cityhash defines a uint64 typedef that can conflict with other libraries (we encountered conflicts with rollinghashcpp's uint64). - Wrapped cityhash in 'cityhash' namespace to prevent pollution. - Updated hash_util.hpp to use cityhash::CityHash128WithSeed instead of the static CityMurmur function, which is mathematically equivalent for the hash sizes used in sshash. - Updated tools/query.cpp to use standard uint64_t type. 4. Add dictionary accessor methods: - Added strings() and strings_offsets() accessors to enable custom operations on dictionary internals without exposing all members. 5. Add SSHASH_BUILD_EXECUTABLES build option: - New CMake option (default ON) allows building sshash as a library without executables, improving integration as a dependency. - Added CITYHASH_SOURCES and properly link cityhash into sshash_static. - Wrapped all executable builds in conditional for flexibility. --- CMakeLists.txt | 48 ++++++++------ external/cityhash/cityhash.cpp | 6 +- external/cityhash/cityhash.hpp | 4 ++ include/buckets_statistics.hpp | 18 +++-- .../builder/build_sparse_and_skew_index.cpp | 65 ++++++++++++------- include/builder/dictionary_builder.hpp | 2 +- include/dictionary.hpp | 6 ++ include/hash_util.hpp | 6 +- tools/query.cpp | 2 +- 9 files changed, 105 insertions(+), 52 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 327d025..f917ba9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -48,16 +48,22 @@ else() set(CONDA_BUILD FALSE) endif() +option(SSHASH_BUILD_EXECUTABLES "Build sshash executables" ON) MESSAGE(STATUS "Build type: ${CMAKE_BUILD_TYPE}") MESSAGE(STATUS "Conda build: ${CONDA_BUILD}") MESSAGE(STATUS "Installation prefix: ${CMAKE_INSTALL_PREFIX}") MESSAGE(STATUS "Compiling for processor: ${CMAKE_SYSTEM_PROCESSOR}") MESSAGE(STATUS "Compiling with flags:${CMAKE_CXX_FLAGS}") +MESSAGE(STATUS "Build executables: ${SSHASH_BUILD_EXECUTABLES}") set(Z_LIB_SOURCES external/gz/zip_stream.cpp ) +set(CITYHASH_SOURCES + external/cityhash/cityhash.cpp +) + set(SSHASH_SOURCES src/build.cpp src/dictionary.cpp @@ -79,32 +85,36 @@ set(SSHASH_INCLUDE_DIRS # Create a static lib add_library(sshash_static STATIC ${Z_LIB_SOURCES} + ${CITYHASH_SOURCES} ${SSHASH_SOURCES} ) target_include_directories(sshash_static PUBLIC ${SSHASH_INCLUDE_DIRS}) -add_executable(sshash tools/sshash.cpp) -target_include_directories(sshash PUBLIC ${SSHASH_INCLUDE_DIRS}) -target_link_libraries(sshash - z -) - -# tests: +if(SSHASH_BUILD_EXECUTABLES) + add_executable(sshash tools/sshash.cpp) + target_include_directories(sshash PUBLIC ${SSHASH_INCLUDE_DIRS}) + target_link_libraries(sshash + sshash_static + z + ) -add_executable(test_alphabet test/test_alphabet.cpp) -target_link_libraries(test_alphabet - sshash_static -) + # tests: -add_executable(check test/check.cpp) -target_link_libraries(check - sshash_static - z -) + add_executable(test_alphabet test/test_alphabet.cpp) + target_link_libraries(test_alphabet + sshash_static + ) -if (CONDA_BUILD) - install(TARGETS sshash - RUNTIME DESTINATION bin + add_executable(check test/check.cpp) + target_link_libraries(check + sshash_static + z ) + + if (CONDA_BUILD) + install(TARGETS sshash + RUNTIME DESTINATION bin + ) + endif() endif() diff --git a/external/cityhash/cityhash.cpp b/external/cityhash/cityhash.cpp index 50cafc5..7e81ded 100644 --- a/external/cityhash/cityhash.cpp +++ b/external/cityhash/cityhash.cpp @@ -34,6 +34,8 @@ #include #include // for memcpy and memset +namespace cityhash { + using namespace std; static uint64 UNALIGNED_LOAD64(const char* p) { @@ -442,4 +444,6 @@ uint128 CityHashCrc128(const char* s, size_t len) { } } -#endif \ No newline at end of file +#endif + +} // namespace cityhash \ No newline at end of file diff --git a/external/cityhash/cityhash.hpp b/external/cityhash/cityhash.hpp index 23fb877..801f919 100644 --- a/external/cityhash/cityhash.hpp +++ b/external/cityhash/cityhash.hpp @@ -48,6 +48,8 @@ #include // for size_t. #include +namespace cityhash { + // Microsoft Visual Studio may not have stdint.h. #if defined(_MSC_VER) && (_MSC_VER < 1600) typedef unsigned char uint8_t; @@ -112,4 +114,6 @@ void CityHashCrc256(const char* s, size_t len, uint64* result); #endif // __SSE4_2__ +} // namespace cityhash + #endif // CITY_HASH_H_ \ No newline at end of file diff --git a/include/buckets_statistics.hpp b/include/buckets_statistics.hpp index 05e7a3e..da77124 100644 --- a/include/buckets_statistics.hpp +++ b/include/buckets_statistics.hpp @@ -11,14 +11,16 @@ struct buckets_statistics { , m_num_kmers(0) , m_num_minimizer_positions(0) , m_max_num_kmers_in_super_kmer(0) - , m_max_bucket_size(0) {} + , m_max_bucket_size(0) + , m_max_buckets_per_size(0) {} buckets_statistics(uint64_t num_buckets, uint64_t num_kmers, uint64_t num_minimizer_positions) : m_num_buckets(num_buckets) , m_num_kmers(num_kmers) , m_num_minimizer_positions(num_minimizer_positions) , m_max_num_kmers_in_super_kmer(0) - , m_max_bucket_size(0) // + , m_max_bucket_size(0) + , m_max_buckets_per_size(0) // { m_bucket_sizes.resize(MAX_BUCKET_SIZE + 1, 0); m_total_kmers.resize(MAX_BUCKET_SIZE + 1, 0); @@ -26,8 +28,11 @@ struct buckets_statistics { } void add_bucket_size(uint64_t bucket_size) { - if (bucket_size < MAX_BUCKET_SIZE + 1) { m_bucket_sizes[bucket_size] += 1; } - if (bucket_size > m_max_bucket_size) { m_max_bucket_size = bucket_size; } + if (bucket_size < MAX_BUCKET_SIZE + 1) { + m_bucket_sizes[bucket_size] += 1; + m_max_buckets_per_size = std::max(m_max_buckets_per_size, m_bucket_sizes[bucket_size]); + } + m_max_bucket_size = std::max(m_max_bucket_size, bucket_size); } void add_num_kmers_in_super_kmer(uint64_t bucket_size, @@ -49,6 +54,7 @@ struct buckets_statistics { uint64_t num_minimizer_positions() const { return m_num_minimizer_positions; } uint64_t max_num_kmers_in_super_kmer() const { return m_max_num_kmers_in_super_kmer; } uint64_t max_bucket_size() const { return m_max_bucket_size; } + uint64_t max_buckets_per_size() const { return m_max_buckets_per_size; } void print_full() const { std::cout << "=== bucket statistics (full) === \n"; @@ -138,6 +144,9 @@ struct buckets_statistics { if (rhs.max_bucket_size() > m_max_bucket_size) { m_max_bucket_size = rhs.max_bucket_size(); } + if (rhs.max_buckets_per_size() > m_max_buckets_per_size) { + m_max_buckets_per_size = rhs.max_buckets_per_size(); + } assert(m_bucket_sizes.size() == rhs.m_bucket_sizes.size()); for (uint64_t i = 0; i != m_bucket_sizes.size(); ++i) { @@ -160,6 +169,7 @@ struct buckets_statistics { uint64_t m_max_num_kmers_in_super_kmer; uint64_t m_max_bucket_size; + uint64_t m_max_buckets_per_size; std::vector m_bucket_sizes; std::vector m_total_kmers; diff --git a/include/builder/build_sparse_and_skew_index.cpp b/include/builder/build_sparse_and_skew_index.cpp index 9e9f1ce..26ee343 100644 --- a/include/builder/build_sparse_and_skew_index.cpp +++ b/include/builder/build_sparse_and_skew_index.cpp @@ -15,13 +15,6 @@ void dictionary_builder::build_sparse_and_skew_index( const uint64_t min_size = 1ULL << constants::min_l; const uint64_t num_bits_per_offset = strings_offsets_builder.num_bits_per_offset(); - if (build_config.verbose) { - std::cout << "num_bits_per_offset = " << num_bits_per_offset << std::endl; - } - - bits::compact_vector::builder control_codewords_builder; - control_codewords_builder.resize(num_minimizers, num_bits_per_offset + 1); - mm::file_source input(minimizers.get_minimizers_filename(), mm::advice::sequential); @@ -33,11 +26,10 @@ void dictionary_builder::build_sparse_and_skew_index( uint64_t num_minimizer_positions_of_buckets_larger_than_1 = 0; uint64_t num_minimizer_positions_of_buckets_in_skew_index = 0; + // First pass: collect bucket statistics to compute tighter bound for (minimizers_tuples_iterator it(input.data(), input.data() + input.size()); // it.has_next(); it.next()) // { - const uint64_t bucket_id = it.minimizer(); - assert(bucket_id < num_minimizers); auto bucket = it.bucket(); const uint64_t bucket_size = bucket.size(); buckets_stats.add_bucket_size(bucket_size); @@ -53,24 +45,31 @@ void dictionary_builder::build_sparse_and_skew_index( num_super_kmers_in_buckets_larger_than_1 += bucket.num_super_kmers(); } - uint64_t prev_pos_in_seq = constants::invalid_uint64; for (auto mt : bucket) { - if (bucket_size == 1 and mt.pos_in_seq != prev_pos_in_seq) { - /* - For minimizers occurring once, store a (log(N)+1)-bit - code, as follows: |offset|0|, i.e., the LSB is 0. - */ - uint64_t code = mt.pos_in_seq << 1; // first LS bit encodes status code: 0 - assert(code < (uint64_t(1) << (num_bits_per_offset + 1))); - control_codewords_builder.set(bucket_id, code); - prev_pos_in_seq = mt.pos_in_seq; - } buckets_stats.add_num_kmers_in_super_kmer(bucket_size, mt.num_kmers_in_super_kmer); } } assert(buckets_stats.num_buckets() == num_minimizers); + // Calculate bits needed for control codewords encoding: + // Encoding format: ((list_id << min_l) | (bucket_size - 2)) << 2 | status_code + // We need: 2 bits (status) + min_l bits (bucket_size) + bits for list_id + // list_id is bounded by the maximum number of buckets sharing the same size + const uint64_t bits_for_list_id = std::ceil(std::log2(buckets_stats.max_buckets_per_size() + 1)); + const uint64_t num_bits_for_control = std::max(num_bits_per_offset + 1, + 2 + constants::min_l + bits_for_list_id); + + if (build_config.verbose) { + std::cout << "num_bits_per_offset = " << num_bits_per_offset << std::endl; + std::cout << "max_list_id = " << buckets_stats.max_buckets_per_size() << std::endl; + std::cout << "bits_for_list_id = " << bits_for_list_id << std::endl; + std::cout << "num_bits_for_control = " << num_bits_for_control << std::endl; + } + + bits::compact_vector::builder control_codewords_builder; + control_codewords_builder.resize(num_minimizers, num_bits_for_control); + strings_offsets_builder.build(d.m_spss.strings_offsets); strings_builder.build(d.m_spss.strings); @@ -100,11 +99,31 @@ void dictionary_builder::build_sparse_and_skew_index( std::vector tuples; // backed memory tuples.reserve(num_super_kmers_in_buckets_larger_than_1); + // Second pass: collect buckets > 1 for sorting AND handle size-1 buckets for (minimizers_tuples_iterator it(input.data(), input.data() + input.size()); // it.has_next(); it.next()) // { + const uint64_t bucket_id = it.minimizer(); auto bucket = it.bucket(); - if (bucket.size() > 1) { + const uint64_t bucket_size = bucket.size(); + + if (bucket_size == 1) { + // Handle size-1 buckets: encode directly into control codewords + uint64_t prev_pos_in_seq = constants::invalid_uint64; + for (auto mt : bucket) { + if (mt.pos_in_seq != prev_pos_in_seq) { + /* + For minimizers occurring once, store a (log(N)+1)-bit + code, as follows: |offset|0|, i.e., the LSB is 0. + */ + uint64_t code = mt.pos_in_seq << 1; // first LS bit encodes status code: 0 + assert(code < (uint64_t(1) << num_bits_for_control)); + control_codewords_builder.set(bucket_id, code); + prev_pos_in_seq = mt.pos_in_seq; + } + } + } else { + // Collect buckets > 1 for later processing minimizer_tuple const* begin = tuples.data() + tuples.size(); std::copy(bucket.begin_ptr(), bucket.end_ptr(), std::back_inserter(tuples)); minimizer_tuple const* end = tuples.data() + tuples.size(); @@ -187,7 +206,7 @@ void dictionary_builder::build_sparse_and_skew_index( if (prev_pos_in_seq == constants::invalid_uint64) { // only once uint64_t p = (list_id << constants::min_l) | (curr_bucket_size - 2); uint64_t code = (p << 2) | 1; // first two LS bits encode status code: 01 - assert(code < (uint64_t(1) << (num_bits_per_offset + 1))); + assert(code < (uint64_t(1) << num_bits_for_control)); control_codewords_builder.set(mt.minimizer, code); } if (mt.pos_in_seq != prev_pos_in_seq) { @@ -204,7 +223,7 @@ void dictionary_builder::build_sparse_and_skew_index( assert(partition_id < 8); uint64_t p = (heavy_load_buckets_size << 3) | partition_id; uint64_t code = (p << 2) | 3; // first two LS bits encode status code: 11 - assert(code < (uint64_t(1) << (num_bits_per_offset + 1))); + assert(code < (uint64_t(1) << num_bits_for_control)); control_codewords_builder.set(mt.minimizer, code); } if (mt.pos_in_seq != prev_pos_in_seq) { diff --git a/include/builder/dictionary_builder.hpp b/include/builder/dictionary_builder.hpp index 65703c0..efc2230 100644 --- a/include/builder/dictionary_builder.hpp +++ b/include/builder/dictionary_builder.hpp @@ -75,7 +75,7 @@ struct dictionary_builder // build_stats.add("index_size_in_bytes", (d.num_bits() + 7) / 8); build_stats.add("num_kmers", d.num_kmers()); - build_stats.print(); + if (build_config.verbose) build_stats.print(); } build_configuration build_config; diff --git a/include/dictionary.hpp b/include/dictionary.hpp index d0fa3db..a30b8c4 100644 --- a/include/dictionary.hpp +++ b/include/dictionary.hpp @@ -68,6 +68,9 @@ struct dictionary // /* Return the string of the kmer whose id is kmer_id. */ void access(uint64_t kmer_id, char* string_kmer) const; + /* Accessor for internal bit vector */ + bits::bit_vector const& strings() const { return m_spss.strings; } + /* Membership queries. */ bool is_member(char const* string_kmer, bool check_reverse_complement = true) const; bool is_member(Kmer uint_kmer, bool check_reverse_complement = true) const; @@ -104,6 +107,9 @@ struct dictionary // return m_spss.string_offsets(string_id); } + /* Accessor for internal offsets structure */ + Offsets const& strings_offsets() const { return m_spss.strings_offsets; } + iterator at_string_id(const uint64_t string_id) const { assert(string_id < num_strings()); auto [begin, end] = string_offsets(string_id); diff --git a/include/hash_util.hpp b/include/hash_util.hpp index d6b4fbc..dddb0c2 100644 --- a/include/hash_util.hpp +++ b/include/hash_util.hpp @@ -1,7 +1,7 @@ #pragma once #include "external/pthash/include/pthash.hpp" -#include "external/cityhash/cityhash.cpp" +#include "external/cityhash/cityhash.hpp" #include "constants.hpp" namespace sshash { @@ -10,7 +10,7 @@ struct minimizers_city_hasher_128 { typedef pthash::hash128 hash_type; static inline pthash::hash128 hash(uint64_t const minimizer, uint64_t seed) { - auto ret = CityMurmur(reinterpret_cast(&minimizer), // + auto ret = cityhash::CityHash128WithSeed(reinterpret_cast(&minimizer), // sizeof(minimizer), {seed, ~seed}); return {ret.first, ret.second}; } @@ -60,7 +60,7 @@ struct kmers_city_hasher_128 { typedef pthash::hash128 hash_type; static inline pthash::hash128 hash(Kmer const x, uint64_t seed) { - auto ret = CityMurmur(reinterpret_cast(&(x.bits)), // + auto ret = cityhash::CityHash128WithSeed(reinterpret_cast(&(x.bits)), // sizeof(x.bits), {seed, ~seed}); return {ret.first, ret.second}; } diff --git a/tools/query.cpp b/tools/query.cpp index 67eec13..0ae64ff 100644 --- a/tools/query.cpp +++ b/tools/query.cpp @@ -41,7 +41,7 @@ int query(int argc, char** argv) { query_stats.add("num_invalid_kmers", report.num_invalid_kmers); query_stats.add("num_searches", report.num_searches); query_stats.add("num_extensions", report.num_extensions); - query_stats.add("elapsed_millisec", uint64(t.elapsed())); + query_stats.add("elapsed_millisec", uint64_t(t.elapsed())); std::cout << "==== query report:\n"; std::cout << "num_kmers = " << report.num_kmers << std::endl; From 7fdd93aa00b3be09adaa87c65d0113f9ad80f802 Mon Sep 17 00:00:00 2001 From: Oleksandr Kulkov Date: Mon, 16 Feb 2026 22:49:52 +0100 Subject: [PATCH 2/2] Refine sparse index fix: track only buckets with size > 1 - Renamed m_max_buckets_per_size to m_max_sparse_buckets_per_size for clarity - Only update counter for bucket_size > 1 (buckets that use list_id encoding) - Size-1 buckets use direct offset encoding, not the sparse index --- include/buckets_statistics.hpp | 16 +++++++++------- include/builder/build_sparse_and_skew_index.cpp | 4 ++-- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/include/buckets_statistics.hpp b/include/buckets_statistics.hpp index da77124..2ff2bab 100644 --- a/include/buckets_statistics.hpp +++ b/include/buckets_statistics.hpp @@ -12,7 +12,7 @@ struct buckets_statistics { , m_num_minimizer_positions(0) , m_max_num_kmers_in_super_kmer(0) , m_max_bucket_size(0) - , m_max_buckets_per_size(0) {} + , m_max_sparse_buckets_per_size(0) {} buckets_statistics(uint64_t num_buckets, uint64_t num_kmers, uint64_t num_minimizer_positions) : m_num_buckets(num_buckets) @@ -20,7 +20,7 @@ struct buckets_statistics { , m_num_minimizer_positions(num_minimizer_positions) , m_max_num_kmers_in_super_kmer(0) , m_max_bucket_size(0) - , m_max_buckets_per_size(0) // + , m_max_sparse_buckets_per_size(0) // { m_bucket_sizes.resize(MAX_BUCKET_SIZE + 1, 0); m_total_kmers.resize(MAX_BUCKET_SIZE + 1, 0); @@ -30,7 +30,9 @@ struct buckets_statistics { void add_bucket_size(uint64_t bucket_size) { if (bucket_size < MAX_BUCKET_SIZE + 1) { m_bucket_sizes[bucket_size] += 1; - m_max_buckets_per_size = std::max(m_max_buckets_per_size, m_bucket_sizes[bucket_size]); + if (bucket_size > 1) { + m_max_sparse_buckets_per_size = std::max(m_max_sparse_buckets_per_size, m_bucket_sizes[bucket_size]); + } } m_max_bucket_size = std::max(m_max_bucket_size, bucket_size); } @@ -54,7 +56,7 @@ struct buckets_statistics { uint64_t num_minimizer_positions() const { return m_num_minimizer_positions; } uint64_t max_num_kmers_in_super_kmer() const { return m_max_num_kmers_in_super_kmer; } uint64_t max_bucket_size() const { return m_max_bucket_size; } - uint64_t max_buckets_per_size() const { return m_max_buckets_per_size; } + uint64_t max_sparse_buckets_per_size() const { return m_max_sparse_buckets_per_size; } void print_full() const { std::cout << "=== bucket statistics (full) === \n"; @@ -144,8 +146,8 @@ struct buckets_statistics { if (rhs.max_bucket_size() > m_max_bucket_size) { m_max_bucket_size = rhs.max_bucket_size(); } - if (rhs.max_buckets_per_size() > m_max_buckets_per_size) { - m_max_buckets_per_size = rhs.max_buckets_per_size(); + if (rhs.max_sparse_buckets_per_size() > m_max_sparse_buckets_per_size) { + m_max_sparse_buckets_per_size = rhs.max_sparse_buckets_per_size(); } assert(m_bucket_sizes.size() == rhs.m_bucket_sizes.size()); @@ -169,7 +171,7 @@ struct buckets_statistics { uint64_t m_max_num_kmers_in_super_kmer; uint64_t m_max_bucket_size; - uint64_t m_max_buckets_per_size; + uint64_t m_max_sparse_buckets_per_size; std::vector m_bucket_sizes; std::vector m_total_kmers; diff --git a/include/builder/build_sparse_and_skew_index.cpp b/include/builder/build_sparse_and_skew_index.cpp index 26ee343..4fe0dab 100644 --- a/include/builder/build_sparse_and_skew_index.cpp +++ b/include/builder/build_sparse_and_skew_index.cpp @@ -56,13 +56,13 @@ void dictionary_builder::build_sparse_and_skew_index( // Encoding format: ((list_id << min_l) | (bucket_size - 2)) << 2 | status_code // We need: 2 bits (status) + min_l bits (bucket_size) + bits for list_id // list_id is bounded by the maximum number of buckets sharing the same size - const uint64_t bits_for_list_id = std::ceil(std::log2(buckets_stats.max_buckets_per_size() + 1)); + const uint64_t bits_for_list_id = std::ceil(std::log2(buckets_stats.max_sparse_buckets_per_size() + 1)); const uint64_t num_bits_for_control = std::max(num_bits_per_offset + 1, 2 + constants::min_l + bits_for_list_id); if (build_config.verbose) { std::cout << "num_bits_per_offset = " << num_bits_per_offset << std::endl; - std::cout << "max_list_id = " << buckets_stats.max_buckets_per_size() << std::endl; + std::cout << "max_list_id = " << buckets_stats.max_sparse_buckets_per_size() << std::endl; std::cout << "bits_for_list_id = " << bits_for_list_id << std::endl; std::cout << "num_bits_for_control = " << num_bits_for_control << std::endl; }