From bf1ee85ae46691a67d20a7305d498c041c4e1138 Mon Sep 17 00:00:00 2001
From: Oleksandr Kulkov <adamant.pwn@gmail.com>
Date: Mon, 16 Feb 2026 20:37:22 +0100
Subject: [PATCH 1/2] Fix sparse index bug and improve library integration

This commit fixes a bug and adds improvements for better
integration as a library dependency:

1. Fix sparse index bit budget calculation:
   - The sparse index encoder uses list_id in control codewords for
     buckets of size 2-64, but was allocating bits based only on
     num_bits_per_offset + 1 (offset size).
   - list_id is a counter within each bucket size category that can
     reach the total number of minimizers in worst case (when all
     minimizers fall into buckets of the same size).
   - This mismatch causes assertion failures on repetitive sequences.
   - Fixed by tracking max_buckets_per_size and ensuring sufficient bits:
     max(num_bits_per_offset + 1, 2 + 6 + ceil(log2(max_buckets_per_size + 1)))
   - No behavior change on typical datasets: if the previous estimate
     was sufficient, the new one will be smaller and use the old value.
     For datasets where the assert failed, this fix enables support.
   - Implementation uses incremental tracking in buckets_statistics
     via std::max (zero time overhead, no additional pass).

2. Make verbose output respect build_config flag:
   - build_stats.print() was called unconditionally in dictionary_builder.hpp.
   - Now respects build_config.verbose flag for better control.

3. Namespace cityhash to avoid typedef conflicts:
   - cityhash defines a uint64 typedef that can conflict with other
     libraries (we encountered conflicts with rollinghashcpp's uint64).
   - Wrapped cityhash in 'cityhash' namespace to prevent pollution.
   - Updated hash_util.hpp to use cityhash::CityHash128WithSeed instead
     of the static CityMurmur function, which is mathematically equivalent
     for the hash sizes used in sshash.
   - Updated tools/query.cpp to use standard uint64_t type.

4. Add dictionary accessor methods:
   - Added strings() and strings_offsets() accessors to enable custom
     operations on dictionary internals without exposing all members.

5. Add SSHASH_BUILD_EXECUTABLES build option:
   - New CMake option (default ON) allows building sshash as a library
     without executables, improving integration as a dependency.
   - Added CITYHASH_SOURCES and properly link cityhash into sshash_static.
   - Wrapped all executable builds in conditional for flexibility.
---
 CMakeLists.txt                                | 48 ++++++++------
 external/cityhash/cityhash.cpp                |  6 +-
 external/cityhash/cityhash.hpp                |  4 ++
 include/buckets_statistics.hpp                | 18 +++--
 .../builder/build_sparse_and_skew_index.cpp   | 65 ++++++++++++-------
 include/builder/dictionary_builder.hpp        |  2 +-
 include/dictionary.hpp                        |  6 ++
 include/hash_util.hpp                         |  6 +-
 tools/query.cpp                               |  2 +-
 9 files changed, 105 insertions(+), 52 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 327d025..f917ba9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -48,16 +48,22 @@ else()
   set(CONDA_BUILD FALSE)
 endif()
 
+option(SSHASH_BUILD_EXECUTABLES "Build sshash executables" ON)
 MESSAGE(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
 MESSAGE(STATUS "Conda build: ${CONDA_BUILD}")
 MESSAGE(STATUS "Installation prefix: ${CMAKE_INSTALL_PREFIX}")
 MESSAGE(STATUS "Compiling for processor: ${CMAKE_SYSTEM_PROCESSOR}")
 MESSAGE(STATUS "Compiling with flags:${CMAKE_CXX_FLAGS}")
+MESSAGE(STATUS "Build executables: ${SSHASH_BUILD_EXECUTABLES}")
 
 set(Z_LIB_SOURCES
   external/gz/zip_stream.cpp
 )
 
+set(CITYHASH_SOURCES
+  external/cityhash/cityhash.cpp
+)
+
 set(SSHASH_SOURCES
   src/build.cpp
   src/dictionary.cpp
@@ -79,32 +85,36 @@ set(SSHASH_INCLUDE_DIRS
 # Create a static lib
 add_library(sshash_static STATIC
   ${Z_LIB_SOURCES}
+  ${CITYHASH_SOURCES}
   ${SSHASH_SOURCES}
 )
 
 target_include_directories(sshash_static PUBLIC ${SSHASH_INCLUDE_DIRS})
 
-add_executable(sshash tools/sshash.cpp)
-target_include_directories(sshash PUBLIC ${SSHASH_INCLUDE_DIRS})
-target_link_libraries(sshash
-  z
-)
-
-# tests:
+if(SSHASH_BUILD_EXECUTABLES)
+  add_executable(sshash tools/sshash.cpp)
+  target_include_directories(sshash PUBLIC ${SSHASH_INCLUDE_DIRS})
+  target_link_libraries(sshash
+    sshash_static
+    z
+  )
 
-add_executable(test_alphabet test/test_alphabet.cpp)
-target_link_libraries(test_alphabet
-  sshash_static
-)
+  # tests:
 
-add_executable(check test/check.cpp)
-target_link_libraries(check
-  sshash_static
-  z
-)
+  add_executable(test_alphabet test/test_alphabet.cpp)
+  target_link_libraries(test_alphabet
+    sshash_static
+  )
 
-if (CONDA_BUILD)
-  install(TARGETS sshash
-    RUNTIME DESTINATION bin
+  add_executable(check test/check.cpp)
+  target_link_libraries(check
+    sshash_static
+    z
   )
+
+  if (CONDA_BUILD)
+    install(TARGETS sshash
+      RUNTIME DESTINATION bin
+    )
+  endif()
 endif()
diff --git a/external/cityhash/cityhash.cpp b/external/cityhash/cityhash.cpp
index 50cafc5..7e81ded 100644
--- a/external/cityhash/cityhash.cpp
+++ b/external/cityhash/cityhash.cpp
@@ -34,6 +34,8 @@
 #include <algorithm>
 #include <string.h>  // for memcpy and memset
 
+namespace cityhash {
+
 using namespace std;
 
 static uint64 UNALIGNED_LOAD64(const char* p) {
@@ -442,4 +444,6 @@ uint128 CityHashCrc128(const char* s, size_t len) {
     }
 }
 
-#endif
\ No newline at end of file
+#endif
+
+}  // namespace cityhash
\ No newline at end of file
diff --git a/external/cityhash/cityhash.hpp b/external/cityhash/cityhash.hpp
index 23fb877..801f919 100644
--- a/external/cityhash/cityhash.hpp
+++ b/external/cityhash/cityhash.hpp
@@ -48,6 +48,8 @@
 #include <stdlib.h>  // for size_t.
 #include <utility>
 
+namespace cityhash {
+
 // Microsoft Visual Studio may not have stdint.h.
 #if defined(_MSC_VER) && (_MSC_VER < 1600)
 typedef unsigned char uint8_t;
@@ -112,4 +114,6 @@ void CityHashCrc256(const char* s, size_t len, uint64* result);
 
 #endif  // __SSE4_2__
 
+}  // namespace cityhash
+
 #endif  // CITY_HASH_H_
\ No newline at end of file
diff --git a/include/buckets_statistics.hpp b/include/buckets_statistics.hpp
index 05e7a3e..da77124 100644
--- a/include/buckets_statistics.hpp
+++ b/include/buckets_statistics.hpp
@@ -11,14 +11,16 @@ struct buckets_statistics {
         , m_num_kmers(0)
         , m_num_minimizer_positions(0)
         , m_max_num_kmers_in_super_kmer(0)
-        , m_max_bucket_size(0) {}
+        , m_max_bucket_size(0)
+        , m_max_buckets_per_size(0) {}
 
     buckets_statistics(uint64_t num_buckets, uint64_t num_kmers, uint64_t num_minimizer_positions)
         : m_num_buckets(num_buckets)
         , m_num_kmers(num_kmers)
         , m_num_minimizer_positions(num_minimizer_positions)
         , m_max_num_kmers_in_super_kmer(0)
-        , m_max_bucket_size(0)  //
+        , m_max_bucket_size(0)
+        , m_max_buckets_per_size(0)  //
     {
         m_bucket_sizes.resize(MAX_BUCKET_SIZE + 1, 0);
         m_total_kmers.resize(MAX_BUCKET_SIZE + 1, 0);
@@ -26,8 +28,11 @@ struct buckets_statistics {
     }
 
     void add_bucket_size(uint64_t bucket_size) {
-        if (bucket_size < MAX_BUCKET_SIZE + 1) { m_bucket_sizes[bucket_size] += 1; }
-        if (bucket_size > m_max_bucket_size) { m_max_bucket_size = bucket_size; }
+        if (bucket_size < MAX_BUCKET_SIZE + 1) {
+            m_bucket_sizes[bucket_size] += 1;
+            m_max_buckets_per_size = std::max(m_max_buckets_per_size, m_bucket_sizes[bucket_size]);
+        }
+        m_max_bucket_size = std::max(m_max_bucket_size, bucket_size);
     }
 
     void add_num_kmers_in_super_kmer(uint64_t bucket_size,
@@ -49,6 +54,7 @@ struct buckets_statistics {
     uint64_t num_minimizer_positions() const { return m_num_minimizer_positions; }
     uint64_t max_num_kmers_in_super_kmer() const { return m_max_num_kmers_in_super_kmer; }
     uint64_t max_bucket_size() const { return m_max_bucket_size; }
+    uint64_t max_buckets_per_size() const { return m_max_buckets_per_size; }
 
     void print_full() const {
         std::cout << "=== bucket statistics (full) === \n";
@@ -138,6 +144,9 @@ struct buckets_statistics {
         if (rhs.max_bucket_size() > m_max_bucket_size) {
             m_max_bucket_size = rhs.max_bucket_size();
         }
+        if (rhs.max_buckets_per_size() > m_max_buckets_per_size) {
+            m_max_buckets_per_size = rhs.max_buckets_per_size();
+        }
 
         assert(m_bucket_sizes.size() == rhs.m_bucket_sizes.size());
         for (uint64_t i = 0; i != m_bucket_sizes.size(); ++i) {
@@ -160,6 +169,7 @@ struct buckets_statistics {
 
     uint64_t m_max_num_kmers_in_super_kmer;
     uint64_t m_max_bucket_size;
+    uint64_t m_max_buckets_per_size;
 
     std::vector<uint64_t> m_bucket_sizes;
     std::vector<uint64_t> m_total_kmers;
diff --git a/include/builder/build_sparse_and_skew_index.cpp b/include/builder/build_sparse_and_skew_index.cpp
index 9e9f1ce..26ee343 100644
--- a/include/builder/build_sparse_and_skew_index.cpp
+++ b/include/builder/build_sparse_and_skew_index.cpp
@@ -15,13 +15,6 @@ void dictionary_builder<Kmer, Offsets>::build_sparse_and_skew_index(
     const uint64_t min_size = 1ULL << constants::min_l;
     const uint64_t num_bits_per_offset = strings_offsets_builder.num_bits_per_offset();
 
-    if (build_config.verbose) {
-        std::cout << "num_bits_per_offset = " << num_bits_per_offset << std::endl;
-    }
-
-    bits::compact_vector::builder control_codewords_builder;
-    control_codewords_builder.resize(num_minimizers, num_bits_per_offset + 1);
-
     mm::file_source<minimizer_tuple> input(minimizers.get_minimizers_filename(),
                                            mm::advice::sequential);
 
@@ -33,11 +26,10 @@ void dictionary_builder<Kmer, Offsets>::build_sparse_and_skew_index(
     uint64_t num_minimizer_positions_of_buckets_larger_than_1 = 0;
     uint64_t num_minimizer_positions_of_buckets_in_skew_index = 0;
 
+    // First pass: collect bucket statistics to compute tighter bound
     for (minimizers_tuples_iterator it(input.data(), input.data() + input.size());  //
          it.has_next(); it.next())                                                  //
     {
-        const uint64_t bucket_id = it.minimizer();
-        assert(bucket_id < num_minimizers);
         auto bucket = it.bucket();
         const uint64_t bucket_size = bucket.size();
         buckets_stats.add_bucket_size(bucket_size);
@@ -53,24 +45,31 @@ void dictionary_builder<Kmer, Offsets>::build_sparse_and_skew_index(
             num_super_kmers_in_buckets_larger_than_1 += bucket.num_super_kmers();
         }
 
-        uint64_t prev_pos_in_seq = constants::invalid_uint64;
         for (auto mt : bucket) {
-            if (bucket_size == 1 and mt.pos_in_seq != prev_pos_in_seq) {
-                /*
-                    For minimizers occurring once, store a (log(N)+1)-bit
-                    code, as follows: |offset|0|, i.e., the LSB is 0.
-                */
-                uint64_t code = mt.pos_in_seq << 1;  // first LS bit encodes status code: 0
-                assert(code < (uint64_t(1) << (num_bits_per_offset + 1)));
-                control_codewords_builder.set(bucket_id, code);
-                prev_pos_in_seq = mt.pos_in_seq;
-            }
             buckets_stats.add_num_kmers_in_super_kmer(bucket_size, mt.num_kmers_in_super_kmer);
         }
     }
 
     assert(buckets_stats.num_buckets() == num_minimizers);
 
+    // Calculate bits needed for control codewords encoding:
+    // Encoding format: ((list_id << min_l) | (bucket_size - 2)) << 2 | status_code
+    // We need: 2 bits (status) + min_l bits (bucket_size) + bits for list_id
+    // list_id is bounded by the maximum number of buckets sharing the same size
+    const uint64_t bits_for_list_id = std::ceil(std::log2(buckets_stats.max_buckets_per_size() + 1));
+    const uint64_t num_bits_for_control = std::max(num_bits_per_offset + 1,
+                                                     2 + constants::min_l + bits_for_list_id);
+
+    if (build_config.verbose) {
+        std::cout << "num_bits_per_offset = " << num_bits_per_offset << std::endl;
+        std::cout << "max_list_id = " << buckets_stats.max_buckets_per_size() << std::endl;
+        std::cout << "bits_for_list_id = " << bits_for_list_id << std::endl;
+        std::cout << "num_bits_for_control = " << num_bits_for_control << std::endl;
+    }
+
+    bits::compact_vector::builder control_codewords_builder;
+    control_codewords_builder.resize(num_minimizers, num_bits_for_control);
+
     strings_offsets_builder.build(d.m_spss.strings_offsets);
     strings_builder.build(d.m_spss.strings);
 
@@ -100,11 +99,31 @@ void dictionary_builder<Kmer, Offsets>::build_sparse_and_skew_index(
     std::vector<minimizer_tuple> tuples;  // backed memory
     tuples.reserve(num_super_kmers_in_buckets_larger_than_1);
 
+    // Second pass: collect buckets > 1 for sorting AND handle size-1 buckets
     for (minimizers_tuples_iterator it(input.data(), input.data() + input.size());  //
          it.has_next(); it.next())                                                  //
     {
+        const uint64_t bucket_id = it.minimizer();
         auto bucket = it.bucket();
-        if (bucket.size() > 1) {
+        const uint64_t bucket_size = bucket.size();
+        
+        if (bucket_size == 1) {
+            // Handle size-1 buckets: encode directly into control codewords
+            uint64_t prev_pos_in_seq = constants::invalid_uint64;
+            for (auto mt : bucket) {
+                if (mt.pos_in_seq != prev_pos_in_seq) {
+                    /*
+                        For minimizers occurring once, store a (log(N)+1)-bit
+                        code, as follows: |offset|0|, i.e., the LSB is 0.
+                    */
+                    uint64_t code = mt.pos_in_seq << 1;  // first LS bit encodes status code: 0
+                    assert(code < (uint64_t(1) << num_bits_for_control));
+                    control_codewords_builder.set(bucket_id, code);
+                    prev_pos_in_seq = mt.pos_in_seq;
+                }
+            }
+        } else {
+            // Collect buckets > 1 for later processing
             minimizer_tuple const* begin = tuples.data() + tuples.size();
             std::copy(bucket.begin_ptr(), bucket.end_ptr(), std::back_inserter(tuples));
             minimizer_tuple const* end = tuples.data() + tuples.size();
@@ -187,7 +206,7 @@ void dictionary_builder<Kmer, Offsets>::build_sparse_and_skew_index(
                     if (prev_pos_in_seq == constants::invalid_uint64) {  // only once
                         uint64_t p = (list_id << constants::min_l) | (curr_bucket_size - 2);
                         uint64_t code = (p << 2) | 1;  // first two LS bits encode status code: 01
-                        assert(code < (uint64_t(1) << (num_bits_per_offset + 1)));
+                        assert(code < (uint64_t(1) << num_bits_for_control));
                         control_codewords_builder.set(mt.minimizer, code);
                     }
                     if (mt.pos_in_seq != prev_pos_in_seq) {
@@ -204,7 +223,7 @@ void dictionary_builder<Kmer, Offsets>::build_sparse_and_skew_index(
                         assert(partition_id < 8);
                         uint64_t p = (heavy_load_buckets_size << 3) | partition_id;
                         uint64_t code = (p << 2) | 3;  // first two LS bits encode status code: 11
-                        assert(code < (uint64_t(1) << (num_bits_per_offset + 1)));
+                        assert(code < (uint64_t(1) << num_bits_for_control));
                         control_codewords_builder.set(mt.minimizer, code);
                     }
                     if (mt.pos_in_seq != prev_pos_in_seq) {
diff --git a/include/builder/dictionary_builder.hpp b/include/builder/dictionary_builder.hpp
index 65703c0..efc2230 100644
--- a/include/builder/dictionary_builder.hpp
+++ b/include/builder/dictionary_builder.hpp
@@ -75,7 +75,7 @@ struct dictionary_builder  //
         build_stats.add("index_size_in_bytes", (d.num_bits() + 7) / 8);
         build_stats.add("num_kmers", d.num_kmers());
 
-        build_stats.print();
+        if (build_config.verbose) build_stats.print();
     }
 
     build_configuration build_config;
diff --git a/include/dictionary.hpp b/include/dictionary.hpp
index d0fa3db..a30b8c4 100644
--- a/include/dictionary.hpp
+++ b/include/dictionary.hpp
@@ -68,6 +68,9 @@ struct dictionary  //
     /* Return the string of the kmer whose id is kmer_id. */
     void access(uint64_t kmer_id, char* string_kmer) const;
 
+    /* Accessor for internal bit vector */
+    bits::bit_vector const& strings() const { return m_spss.strings; }
+
     /* Membership queries. */
     bool is_member(char const* string_kmer, bool check_reverse_complement = true) const;
     bool is_member(Kmer uint_kmer, bool check_reverse_complement = true) const;
@@ -104,6 +107,9 @@ struct dictionary  //
         return m_spss.string_offsets(string_id);
     }
 
+    /* Accessor for internal offsets structure */
+    Offsets const& strings_offsets() const { return m_spss.strings_offsets; }
+
     iterator at_string_id(const uint64_t string_id) const {
         assert(string_id < num_strings());
         auto [begin, end] = string_offsets(string_id);
diff --git a/include/hash_util.hpp b/include/hash_util.hpp
index d6b4fbc..dddb0c2 100644
--- a/include/hash_util.hpp
+++ b/include/hash_util.hpp
@@ -1,7 +1,7 @@
 #pragma once
 
 #include "external/pthash/include/pthash.hpp"
-#include "external/cityhash/cityhash.cpp"
+#include "external/cityhash/cityhash.hpp"
 #include "constants.hpp"
 
 namespace sshash {
@@ -10,7 +10,7 @@ struct minimizers_city_hasher_128 {
     typedef pthash::hash128 hash_type;
 
     static inline pthash::hash128 hash(uint64_t const minimizer, uint64_t seed) {
-        auto ret = CityMurmur(reinterpret_cast<char const*>(&minimizer),  //
+        auto ret = cityhash::CityHash128WithSeed(reinterpret_cast<char const*>(&minimizer),  //
                               sizeof(minimizer), {seed, ~seed});
         return {ret.first, ret.second};
     }
@@ -60,7 +60,7 @@ struct kmers_city_hasher_128 {
     typedef pthash::hash128 hash_type;
 
     static inline pthash::hash128 hash(Kmer const x, uint64_t seed) {
-        auto ret = CityMurmur(reinterpret_cast<char const*>(&(x.bits)),  //
+        auto ret = cityhash::CityHash128WithSeed(reinterpret_cast<char const*>(&(x.bits)),  //
                               sizeof(x.bits), {seed, ~seed});
         return {ret.first, ret.second};
     }
diff --git a/tools/query.cpp b/tools/query.cpp
index 67eec13..0ae64ff 100644
--- a/tools/query.cpp
+++ b/tools/query.cpp
@@ -41,7 +41,7 @@ int query(int argc, char** argv) {
     query_stats.add("num_invalid_kmers", report.num_invalid_kmers);
     query_stats.add("num_searches", report.num_searches);
     query_stats.add("num_extensions", report.num_extensions);
-    query_stats.add("elapsed_millisec", uint64(t.elapsed()));
+    query_stats.add("elapsed_millisec", uint64_t(t.elapsed()));
 
     std::cout << "==== query report:\n";
     std::cout << "num_kmers = " << report.num_kmers << std::endl;

From 7fdd93aa00b3be09adaa87c65d0113f9ad80f802 Mon Sep 17 00:00:00 2001
From: Oleksandr Kulkov <adamant.pwn@gmail.com>
Date: Mon, 16 Feb 2026 22:49:52 +0100
Subject: [PATCH 2/2] Refine sparse index fix: track only buckets with size > 1

- Renamed m_max_buckets_per_size to m_max_sparse_buckets_per_size for clarity
- Only update counter for bucket_size > 1 (buckets that use list_id encoding)
- Size-1 buckets use direct offset encoding, not the sparse index
---
 include/buckets_statistics.hpp                  | 16 +++++++++-------
 include/builder/build_sparse_and_skew_index.cpp |  4 ++--
 2 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/include/buckets_statistics.hpp b/include/buckets_statistics.hpp
index da77124..2ff2bab 100644
--- a/include/buckets_statistics.hpp
+++ b/include/buckets_statistics.hpp
@@ -12,7 +12,7 @@ struct buckets_statistics {
         , m_num_minimizer_positions(0)
         , m_max_num_kmers_in_super_kmer(0)
         , m_max_bucket_size(0)
-        , m_max_buckets_per_size(0) {}
+        , m_max_sparse_buckets_per_size(0) {}
 
     buckets_statistics(uint64_t num_buckets, uint64_t num_kmers, uint64_t num_minimizer_positions)
         : m_num_buckets(num_buckets)
@@ -20,7 +20,7 @@ struct buckets_statistics {
         , m_num_minimizer_positions(num_minimizer_positions)
         , m_max_num_kmers_in_super_kmer(0)
         , m_max_bucket_size(0)
-        , m_max_buckets_per_size(0)  //
+        , m_max_sparse_buckets_per_size(0)  //
     {
         m_bucket_sizes.resize(MAX_BUCKET_SIZE + 1, 0);
         m_total_kmers.resize(MAX_BUCKET_SIZE + 1, 0);
@@ -30,7 +30,9 @@ struct buckets_statistics {
     void add_bucket_size(uint64_t bucket_size) {
         if (bucket_size < MAX_BUCKET_SIZE + 1) {
             m_bucket_sizes[bucket_size] += 1;
-            m_max_buckets_per_size = std::max(m_max_buckets_per_size, m_bucket_sizes[bucket_size]);
+            if (bucket_size > 1) {
+                m_max_sparse_buckets_per_size = std::max(m_max_sparse_buckets_per_size, m_bucket_sizes[bucket_size]);
+            }
         }
         m_max_bucket_size = std::max(m_max_bucket_size, bucket_size);
     }
@@ -54,7 +56,7 @@ struct buckets_statistics {
     uint64_t num_minimizer_positions() const { return m_num_minimizer_positions; }
     uint64_t max_num_kmers_in_super_kmer() const { return m_max_num_kmers_in_super_kmer; }
     uint64_t max_bucket_size() const { return m_max_bucket_size; }
-    uint64_t max_buckets_per_size() const { return m_max_buckets_per_size; }
+    uint64_t max_sparse_buckets_per_size() const { return m_max_sparse_buckets_per_size; }
 
     void print_full() const {
         std::cout << "=== bucket statistics (full) === \n";
@@ -144,8 +146,8 @@ struct buckets_statistics {
         if (rhs.max_bucket_size() > m_max_bucket_size) {
             m_max_bucket_size = rhs.max_bucket_size();
         }
-        if (rhs.max_buckets_per_size() > m_max_buckets_per_size) {
-            m_max_buckets_per_size = rhs.max_buckets_per_size();
+        if (rhs.max_sparse_buckets_per_size() > m_max_sparse_buckets_per_size) {
+            m_max_sparse_buckets_per_size = rhs.max_sparse_buckets_per_size();
         }
 
         assert(m_bucket_sizes.size() == rhs.m_bucket_sizes.size());
@@ -169,7 +171,7 @@ struct buckets_statistics {
 
     uint64_t m_max_num_kmers_in_super_kmer;
     uint64_t m_max_bucket_size;
-    uint64_t m_max_buckets_per_size;
+    uint64_t m_max_sparse_buckets_per_size;
 
     std::vector<uint64_t> m_bucket_sizes;
     std::vector<uint64_t> m_total_kmers;
diff --git a/include/builder/build_sparse_and_skew_index.cpp b/include/builder/build_sparse_and_skew_index.cpp
index 26ee343..4fe0dab 100644
--- a/include/builder/build_sparse_and_skew_index.cpp
+++ b/include/builder/build_sparse_and_skew_index.cpp
@@ -56,13 +56,13 @@ void dictionary_builder<Kmer, Offsets>::build_sparse_and_skew_index(
     // Encoding format: ((list_id << min_l) | (bucket_size - 2)) << 2 | status_code
     // We need: 2 bits (status) + min_l bits (bucket_size) + bits for list_id
     // list_id is bounded by the maximum number of buckets sharing the same size
-    const uint64_t bits_for_list_id = std::ceil(std::log2(buckets_stats.max_buckets_per_size() + 1));
+    const uint64_t bits_for_list_id = std::ceil(std::log2(buckets_stats.max_sparse_buckets_per_size() + 1));
     const uint64_t num_bits_for_control = std::max(num_bits_per_offset + 1,
                                                      2 + constants::min_l + bits_for_list_id);
 
     if (build_config.verbose) {
         std::cout << "num_bits_per_offset = " << num_bits_per_offset << std::endl;
-        std::cout << "max_list_id = " << buckets_stats.max_buckets_per_size() << std::endl;
+        std::cout << "max_list_id = " << buckets_stats.max_sparse_buckets_per_size() << std::endl;
         std::cout << "bits_for_list_id = " << bits_for_list_id << std::endl;
         std::cout << "num_bits_for_control = " << num_bits_for_control << std::endl;
     }