From 9007363a6d53b3b4f9a8f6a9c4f48d07d9cb402a Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Fri, 10 Apr 2026 19:04:32 +0530 Subject: [PATCH 01/32] pushed old changes made when travelling in Nepal: thinner bucket_type --- include/builder/util.hpp | 39 ++++++++++----------- src/builder/build_sparse_and_skew_index.cpp | 26 ++++++++------ 2 files changed, 35 insertions(+), 30 deletions(-) diff --git a/include/builder/util.hpp b/include/builder/util.hpp index 81efc34..31888d3 100644 --- a/include/builder/util.hpp +++ b/include/builder/util.hpp @@ -60,22 +60,7 @@ inline std::ostream& operator<<(std::ostream& os, minimizer_tuple const& mt) { struct bucket_type { bucket_type(minimizer_tuple const* begin, minimizer_tuple const* end) - : m_begin(begin) - , m_end(end) - , m_num_super_kmers(std::distance(begin, end)) - , m_num_minimizer_positions(0) // - { - uint64_t prev_pos_in_seq = constants::invalid_uint64; - while (begin != end) { - uint64_t pos_in_seq = (*begin).pos_in_seq; - if (pos_in_seq != prev_pos_in_seq) { - ++m_num_minimizer_positions; - prev_pos_in_seq = pos_in_seq; - } - ++begin; - } - assert(m_num_minimizer_positions <= m_num_super_kmers); - } + : m_begin(begin), m_end(end) {} struct iterator { iterator(minimizer_tuple const* begin) : m_begin(begin) {} @@ -103,8 +88,24 @@ struct bucket_type { So the method size() returns the number of minimizer positions which is <= the number of superkmers. 
*/ - uint64_t num_super_kmers() const { return m_num_super_kmers; } - uint64_t size() const { return m_num_minimizer_positions; } + + uint64_t num_super_kmers() const { return std::distance(m_begin, m_end); } + + uint64_t size() const { + uint64_t num_minimizer_positions = 0; + uint64_t prev_pos_in_seq = constants::invalid_uint64; + auto const* begin = m_begin; + while (begin != m_end) { + uint64_t pos_in_seq = (*begin).pos_in_seq; + if (pos_in_seq != prev_pos_in_seq) { + ++num_minimizer_positions; + prev_pos_in_seq = pos_in_seq; + } + ++begin; + } + assert(num_minimizer_positions <= num_super_kmers()); + return num_minimizer_positions; + } minimizer_tuple const* begin_ptr() const { return m_begin; } minimizer_tuple const* end_ptr() const { return m_end; } @@ -112,8 +113,6 @@ struct bucket_type { private: minimizer_tuple const* m_begin; minimizer_tuple const* m_end; - uint64_t m_num_super_kmers; - uint64_t m_num_minimizer_positions; }; /* diff --git a/src/builder/build_sparse_and_skew_index.cpp b/src/builder/build_sparse_and_skew_index.cpp index 7ed9886..cae852b 100644 --- a/src/builder/build_sparse_and_skew_index.cpp +++ b/src/builder/build_sparse_and_skew_index.cpp @@ -25,7 +25,9 @@ void dictionary_builder::build_sparse_and_skew_index( uint64_t num_minimizer_positions_of_buckets_larger_than_1 = 0; uint64_t num_minimizer_positions_of_buckets_in_skew_index = 0; - // First pass: collect bucket statistics to compute tighter bound + /* + First pass: collect bucket statistics to compute tighter bound. 
+ */ for (minimizers_tuples_iterator it(input.data(), input.data() + input.size()); // it.has_next(); it.next()) // { @@ -51,10 +53,13 @@ void dictionary_builder::build_sparse_and_skew_index( assert(buckets_stats.num_buckets() == num_minimizers); - // Calculate bits needed for control codewords encoding: - // Encoding format: ((list_id << min_l) | (bucket_size - 2)) << 2 | status_code - // We need: 2 bits (status) + min_l bits (bucket_size) + bits for list_id - // list_id is bounded by the maximum number of buckets sharing the same size + /* + Calculate bits needed for control codewords encoding. + Encoding format: + ((list_id << min_l) | (bucket_size - 2)) << 2 | status_code + We need: 2 bits (status) + min_l bits (bucket_size) + bits for list_id. + list_id is bounded by the maximum number of buckets sharing the same size. + */ const uint64_t bits_for_list_id = std::ceil(std::log2(buckets_stats.max_sparse_buckets_per_size() + 1)); const uint64_t num_bits_for_control = @@ -106,7 +111,6 @@ void dictionary_builder::build_sparse_and_skew_index( const uint64_t bucket_id = it.minimizer(); auto bucket = it.bucket(); const uint64_t bucket_size = bucket.size(); - if (bucket_size == 1) { // Handle size-1 buckets: encode directly into control codewords uint64_t prev_pos_in_seq = constants::invalid_uint64; @@ -271,7 +275,8 @@ void dictionary_builder::build_sparse_and_skew_index( for (uint64_t i = buckets.size() - num_buckets_in_skew_index; i <= buckets.size(); ++i) // { auto const& bucket = buckets[i]; - while (i == buckets.size() or bucket.size() > upper) // + const uint64_t bucket_size = bucket.size(); + while (i == buckets.size() or bucket_size > upper) // { if (build_config.verbose) { std::cout << " partition = " << partition_id @@ -291,7 +296,7 @@ void dictionary_builder::build_sparse_and_skew_index( if (i == buckets.size()) break; - assert(bucket.size() > lower and bucket.size() <= upper); + assert(bucket_size > lower and bucket_size <= upper); for (auto mt : bucket) { 
num_kmers_in_partition[partition_id] += mt.num_kmers_in_super_kmer; } @@ -341,7 +346,8 @@ void dictionary_builder::build_sparse_and_skew_index( i <= buckets.size(); ++i) // { auto const& bucket = buckets[i]; - while (i == buckets.size() or bucket.size() > upper) // + const uint64_t bucket_size = bucket.size(); + while (i == buckets.size() or bucket_size > upper) // { if (build_config.verbose) { std::cout << " lower = " << lower << "; upper = " << upper @@ -441,7 +447,7 @@ void dictionary_builder::build_sparse_and_skew_index( if (i == buckets.size()) break; - assert(bucket.size() > lower and bucket.size() <= upper); + assert(bucket_size > lower and bucket_size <= upper); uint64_t pos_in_bucket = -1; uint64_t prev_pos_in_seq = constants::invalid_uint64; for (auto mt : bucket) // From 54aa00b4f18f3a0f058accbbb3741cead4ffa203 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 4 May 2026 20:32:13 +0000 Subject: [PATCH 02/32] fixed issue with minimizers_tuples_iterator Cherry-picked from 1f0cdd1 on master. 
https://claude.ai/code/session_01BShS2GDASvEsCAbgJyQVBK --- include/builder/util.hpp | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/include/builder/util.hpp b/include/builder/util.hpp index 31888d3..9b9b209 100644 --- a/include/builder/util.hpp +++ b/include/builder/util.hpp @@ -145,13 +145,10 @@ struct minimizers_tuples_iterator { minimizer_tuple const* m_end; minimizer_tuple const* next_begin() { + if (m_bucket_begin == m_end) return m_end; minimizer_tuple const* begin = m_bucket_begin; - uint64_t prev_minimizer = (*begin).minimizer; - while (begin != m_end) { - ++begin; - uint64_t curr_minimizer = (*begin).minimizer; - if (curr_minimizer != prev_minimizer) break; - } + uint64_t prev_minimizer = begin->minimizer; + while (++begin != m_end and begin->minimizer == prev_minimizer) {} return begin; } }; From 259b7fc4ab1c87a32e6bc631b46fa9afed7cb2f2 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 4 May 2026 20:32:26 +0000 Subject: [PATCH 03/32] step 7.2: stream over strings instead of random access The skew-index build used to do a random read into the strings bitvector for every super-kmer in every heavy-load bucket (buckets are visited in size-sorted order, so the resulting positions are essentially random across all of strings). That forces strings to be RAM-resident throughout step 7.2. This commit restructures step 7.2 into three sub-steps: (A) walk the heavy-load buckets and emit one kmer_extraction_request per super-kmer, externally sort+flushed by starting_pos (parallel_sort within a bounded RAM buffer, ~1/4 of --ram-limit). (B) merge the sorted runs and walk strings in a single forward pass; for each request extract the requested kmers and append (kmer.bits, pos_in_bucket) to a per-partition tmp file. Only kmer.bits is serialized to avoid persisting the vptr that uint_kmer_t carries via its virtual destructor. (C) for each partition, read its tmp file, build the MPHF and the positions compact vector. 
The skew index is assembled partition by partition. The access pattern over strings is now monotonically non-decreasing, which is the precondition for moving strings itself out of RAM in a follow-up change. Correctness verified via `sshash build --check` on salmonella_enterica (m=7), canonical mode, and salmonella_100 with 4 threads. https://claude.ai/code/session_01BShS2GDASvEsCAbgJyQVBK --- src/builder/build_sparse_and_skew_index.cpp | 419 +++++++++++++------- 1 file changed, 267 insertions(+), 152 deletions(-) diff --git a/src/builder/build_sparse_and_skew_index.cpp b/src/builder/build_sparse_and_skew_index.cpp index cae852b..1720274 100644 --- a/src/builder/build_sparse_and_skew_index.cpp +++ b/src/builder/build_sparse_and_skew_index.cpp @@ -2,6 +2,40 @@ namespace sshash { +/* + A request to extract `num_kmers_in_super_kmer` consecutive k-mers from + `strings` starting at base position `starting_pos`. After requests are + externally sorted by `starting_pos`, k-mer extraction reduces to a single + forward scan over `strings`. 
+*/ +#pragma pack(push, 4) +struct kmer_extraction_request { + kmer_extraction_request() {} + kmer_extraction_request(uint64_t starting_pos, uint32_t partition_id, + uint32_t pos_in_bucket, uint32_t num_kmers_in_super_kmer) + : starting_pos(starting_pos) + , partition_id(partition_id) + , pos_in_bucket(pos_in_bucket) + , num_kmers_in_super_kmer(num_kmers_in_super_kmer) {} + + bool operator<(kmer_extraction_request const& o) const { + return starting_pos < o.starting_pos; + } + bool operator>(kmer_extraction_request const& o) const { + return starting_pos > o.starting_pos; + } + + static kmer_extraction_request max() { + return kmer_extraction_request(uint64_t(-1), uint32_t(-1), uint32_t(-1), uint32_t(-1)); + } + + uint64_t starting_pos; + uint32_t partition_id; + uint32_t pos_in_bucket; + uint32_t num_kmers_in_super_kmer; +}; +#pragma pack(pop) + template void dictionary_builder::build_sparse_and_skew_index( dictionary& d) // @@ -262,58 +296,196 @@ void dictionary_builder::build_sparse_and_skew_index( return; } - /* step 2. build skew index */ + /* + step 2. build skew index + + We do this in three sub-steps: + (A) walk the heavy-load buckets in size-sorted order, decode each + super-kmer's absolute starting position in `strings` and emit a + `kmer_extraction_request`. Requests are sort+flushed to disk in + chunks (external sort by `starting_pos`). + (B) merge the sorted runs and walk `strings` in a single forward + sequential pass, extracting the requested k-mers. For each k-mer + we append `(kmer.bits, pos_in_bucket)` to a per-partition tmp file. + (C) for each partition, read its tmp file, build the MPHF, then build + the positions compact vector. The skew index is assembled + partition by partition. + + Avoiding the random access pattern over `strings` in (B) is the + precondition for moving `strings` itself out of RAM in a later step. 
+ */ timer.start(); + std::vector num_kmers_in_partition(num_partitions, 0); + /* unique run identifier for the tmp files produced by this step */ + const uint64_t skew_run_id = pthash::clock_type::now().time_since_epoch().count(); + auto request_run_filename = [&](uint64_t id) { + std::stringstream ss; + ss << build_config.tmp_dirname << "/sshash.tmp.run_" << skew_run_id + << ".kmer_requests." << id << ".bin"; + return ss.str(); + }; + auto skew_partition_filename = [&](uint64_t pid) { + std::stringstream ss; + ss << build_config.tmp_dirname << "/sshash.tmp.run_" << skew_run_id + << ".skew_kmers." << pid << ".bin"; + return ss.str(); + }; + + /* (A) emit kmer-extraction requests, externally sorted by `starting_pos` */ + std::atomic num_request_runs{0}; { + const uint64_t request_buffer_capacity = std::max( + uint64_t(1) << 16, + (build_config.ram_limit_in_GiB * essentials::GiB) / + (4 * sizeof(kmer_extraction_request))); + + std::vector request_buffer; + request_buffer.reserve(request_buffer_capacity); + + auto flush_request_buffer = [&]() { + if (request_buffer.empty()) return; + parallel_sort(request_buffer, build_config.num_threads, + [](kmer_extraction_request const& a, + kmer_extraction_request const& b) { + return a.starting_pos < b.starting_pos; + }); + const uint64_t id = num_request_runs.fetch_add(1); + const std::string fn = request_run_filename(id); + if (build_config.verbose) { + std::cout << "saving to file '" << fn << "'..." 
<< std::endl; + } + std::ofstream out(fn, std::ofstream::binary); + if (!out.is_open()) throw std::runtime_error("cannot open file"); + out.write(reinterpret_cast(request_buffer.data()), + request_buffer.size() * sizeof(kmer_extraction_request)); + out.close(); + request_buffer.clear(); + }; + uint64_t partition_id = 0; uint64_t lower = min_size; uint64_t upper = 2 * lower; - uint64_t num_kmers_in_skew_index = 0; - for (uint64_t i = buckets.size() - num_buckets_in_skew_index; i <= buckets.size(); ++i) // + for (uint64_t i = buckets.size() - num_buckets_in_skew_index; i < buckets.size(); ++i) // { auto const& bucket = buckets[i]; const uint64_t bucket_size = bucket.size(); - while (i == buckets.size() or bucket_size > upper) // + while (bucket_size > upper) // { - if (build_config.verbose) { - std::cout << " partition = " << partition_id - << ": num kmers in buckets of size > " << lower << " and <= " << upper - << ": " << num_kmers_in_partition[partition_id] << std::endl; - } - - num_kmers_in_skew_index += num_kmers_in_partition[partition_id]; - - if (i == buckets.size()) break; - lower = upper; upper = 2 * lower; partition_id += 1; if (partition_id == num_partitions - 1) upper = max_bucket_size; } - - if (i == buckets.size()) break; - assert(bucket_size > lower and bucket_size <= upper); - for (auto mt : bucket) { + assert(partition_id < num_partitions); + + uint32_t pos_in_bucket = uint32_t(-1); + uint64_t prev_pos_in_seq = constants::invalid_uint64; + for (auto mt : bucket) // + { num_kmers_in_partition[partition_id] += mt.num_kmers_in_super_kmer; + if (mt.pos_in_seq != prev_pos_in_seq) { + prev_pos_in_seq = mt.pos_in_seq; + ++pos_in_bucket; + } + assert(mt.pos_in_seq >= mt.pos_in_kmer); + const uint64_t abs_offset = + d.m_spss.strings_offsets.decode(mt.pos_in_seq).absolute_offset; + const uint64_t starting_pos = abs_offset - mt.pos_in_kmer; + if (request_buffer.size() == request_buffer_capacity) flush_request_buffer(); + request_buffer.emplace_back(starting_pos, 
// + uint32_t(partition_id), // + pos_in_bucket, // + uint32_t(mt.num_kmers_in_super_kmer)); // } } + flush_request_buffer(); assert(partition_id == num_partitions - 1); + } + + if (build_config.verbose) { + uint64_t total_kmers_in_skew = 0; + for (uint64_t p = 0; p != num_partitions; ++p) { + total_kmers_in_skew += num_kmers_in_partition[p]; + std::cout << " partition = " << p + << ": num kmers in partition = " << num_kmers_in_partition[p] << std::endl; + } + std::cout << "num kmers in skew index = " << total_kmers_in_skew << " (" + << (total_kmers_in_skew * 100.0) / buckets_stats.num_kmers() << "%)" << std::endl; + } - if (build_config.verbose) { - std::cout << "num kmers in skew index = " << num_kmers_in_skew_index << " (" - << (num_kmers_in_skew_index * 100.0) / buckets_stats.num_kmers() << "%)" - << std::endl; + /* (B) sequential extraction over `strings` -> per-partition kmer tmp files */ + { + struct request_run_names_iterator { + request_run_names_iterator(std::string const& tmp_dirname, uint64_t skew_run_id) + : i(0), skew_run_id(skew_run_id), tmp_dirname(tmp_dirname) {} + + std::string operator*() const { + std::stringstream ss; + ss << tmp_dirname << "/sshash.tmp.run_" << skew_run_id + << ".kmer_requests." 
<< i << ".bin"; + return ss.str(); + } + void operator++() { ++i; } + + uint64_t i; + uint64_t skew_run_id; + std::string tmp_dirname; + }; + + request_run_names_iterator names_it(build_config.tmp_dirname, skew_run_id); + file_merging_iterator merger(names_it, num_request_runs.load()); + + std::vector partition_writers(num_partitions); + for (uint64_t p = 0; p != num_partitions; ++p) { + if (num_kmers_in_partition[p] == 0) continue; + partition_writers[p].open(skew_partition_filename(p), + std::ofstream::binary | std::ofstream::trunc); + if (!partition_writers[p].is_open()) { + throw std::runtime_error("cannot open skew-partition tmp file"); + } } - assert(num_kmers_in_skew_index == std::accumulate(num_kmers_in_partition.begin(), - num_kmers_in_partition.end(), - uint64_t(0))); + const uint64_t k = build_config.k; + const bool canonical = build_config.canonical; + kmer_iterator kmer_it(d.m_spss.strings, k); + + while (merger.has_next()) // + { + const kmer_extraction_request req = *merger; + kmer_it.at(Kmer::bits_per_char * req.starting_pos); + for (uint32_t i = 0; i != req.num_kmers_in_super_kmer; ++i) { + Kmer kmer = kmer_it.get(); + if (canonical) { + Kmer kmer_rc = kmer; + kmer_rc.reverse_complement_inplace(k); + kmer = std::min(kmer, kmer_rc); + } + auto& w = partition_writers[req.partition_id]; + /* write only `kmer.bits` (avoids serializing the vptr that + `uint_kmer_t` carries due to its virtual destructor) */ + w.write(reinterpret_cast(&kmer.bits), sizeof(kmer.bits)); + w.write(reinterpret_cast(&req.pos_in_bucket), + sizeof(req.pos_in_bucket)); + kmer_it.next(); + } + merger.next(); + } + merger.close(); + + for (auto& w : partition_writers) { + if (w.is_open()) w.close(); + } + + for (uint64_t i = 0; i != num_request_runs.load(); ++i) { + std::remove(request_run_filename(i).c_str()); + } } + /* (C) per-partition MPHF + positions build */ { std::vector> mphfs; std::vector positions; @@ -329,155 +501,98 @@ void 
dictionary_builder::build_sparse_and_skew_index( mphf_build_config.num_threads = build_config.num_threads; mphf_build_config.avg_partition_size = constants::avg_partition_size; - uint64_t partition_id = 0; uint64_t lower = min_size; uint64_t upper = 2 * lower; uint64_t num_bits_per_pos = constants::min_l + 1; + if (num_partitions == 1) { + upper = max_bucket_size; + num_bits_per_pos = log2_max_bucket_size; + } - /* Temporary storage for kmers and positions within a partition. */ - std::vector kmers; - std::vector positions_in_bucket; - bits::compact_vector::builder cvb_positions; - kmers.reserve(num_kmers_in_partition[partition_id]); - positions_in_bucket.reserve(num_kmers_in_partition[partition_id]); - cvb_positions.resize(num_kmers_in_partition[partition_id], num_bits_per_pos); - - for (uint64_t i = buckets.size() - num_buckets_in_skew_index, k = build_config.k; - i <= buckets.size(); ++i) // + for (uint64_t partition_id = 0; partition_id != num_partitions; ++partition_id) // { - auto const& bucket = buckets[i]; - const uint64_t bucket_size = bucket.size(); - while (i == buckets.size() or bucket_size > upper) // - { - if (build_config.verbose) { - std::cout << " lower = " << lower << "; upper = " << upper - << "; num_bits_per_pos = " << num_bits_per_pos - << "; num_kmers_in_partition = " << kmers.size() << std::endl; - } - assert(num_kmers_in_partition[partition_id] == kmers.size()); - assert(num_kmers_in_partition[partition_id] == positions_in_bucket.size()); - - if (num_kmers_in_partition[partition_id] > 0) // - { - /*******/ - // { - // uint64_t RAM_available_in_bytes = essentials::GiB; - - // uint64_t RAM_taken_in_bytes = essentials::vec_bytes(buckets) + - // essentials::vec_bytes(tuples) + - - // essentials::vec_bytes(kmers) + - // essentials::vec_bytes(positions_in_bucket) - // + - // essentials::vec_bytes(cvb_positions.data()) - // + - - // d.num_bits() / 8; // current memory - - // std::cout << "RAM_taken_in_bytes = " << RAM_taken_in_bytes << std::endl; - 
- // const uint64_t RAM_limit_in_bytes = - // build_config.ram_limit_in_GiB * essentials::GiB; - - // if (RAM_limit_in_bytes > RAM_taken_in_bytes) { - // RAM_available_in_bytes = std::max( - // RAM_limit_in_bytes - RAM_taken_in_bytes, RAM_available_in_bytes); - // } - // std::cout << "RAM_available_in_bytes = " << RAM_available_in_bytes - // << std::endl; - - // mphf_build_config.ram = RAM_available_in_bytes / 2; // at least 0.5 GB - // } - /*******/ - - if (build_config.verbose) { - const uint64_t avg_partition_size = - pthash::compute_avg_partition_size(kmers.size(), mphf_build_config); - const uint64_t num_partitions = - pthash::compute_num_partitions(kmers.size(), avg_partition_size); - assert(num_partitions > 0); - std::cout << " building MPHF with " << mphf_build_config.num_threads - << " threads and " << num_partitions - << " partitions (avg. partition size = " << avg_partition_size - << ")..." << std::endl; - } + const uint64_t n = num_kmers_in_partition[partition_id]; - auto& F = mphfs[partition_id]; - F.build_in_internal_memory(kmers.begin(), kmers.size(), mphf_build_config); + if (build_config.verbose) { + std::cout << " lower = " << lower << "; upper = " << upper + << "; num_bits_per_pos = " << num_bits_per_pos + << "; num_kmers_in_partition = " << n << std::endl; + } - if (build_config.verbose) { - std::cout << " built mphs[" << partition_id << "] for " << kmers.size() - << " kmers; bits/key = " - << static_cast(F.num_bits()) / F.num_keys() << std::endl; - } + if (n > 0) // + { + std::vector kmers; + std::vector positions_in_bucket; + kmers.reserve(n); + positions_in_bucket.reserve(n); - for (uint64_t i = 0; i != kmers.size(); ++i) { - Kmer kmer = kmers[i]; - uint64_t pos = F(kmer); - uint32_t pos_in_bucket = positions_in_bucket[i]; - cvb_positions.set(pos, pos_in_bucket); + { + const std::string fn = skew_partition_filename(partition_id); + std::ifstream in(fn, std::ifstream::binary); + if (!in.is_open()) { + throw std::runtime_error("cannot open 
skew-partition tmp file"); } - auto& P = positions[partition_id]; - cvb_positions.build(P); - - if (build_config.verbose) { - std::cout << " built positions[" << partition_id << "] for " << P.size() - << " kmers; bits/key = " << (P.num_bytes() * 8.0) / P.size() - << std::endl; + for (uint64_t i = 0; i != n; ++i) { + Kmer kmer; + in.read(reinterpret_cast(&kmer.bits), sizeof(kmer.bits)); + uint32_t pib; + in.read(reinterpret_cast(&pib), sizeof(pib)); + kmers.push_back(kmer); + positions_in_bucket.push_back(pib); } + in.close(); + std::remove(fn.c_str()); } - if (i == buckets.size()) break; + bits::compact_vector::builder cvb_positions; + cvb_positions.resize(n, num_bits_per_pos); - lower = upper; - upper = 2 * lower; - num_bits_per_pos += 1; - partition_id += 1; - if (partition_id == num_partitions - 1) { - upper = max_bucket_size; - num_bits_per_pos = log2_max_bucket_size; + if (build_config.verbose) { + const uint64_t avg_partition_size = + pthash::compute_avg_partition_size(kmers.size(), mphf_build_config); + const uint64_t pthash_num_partitions = + pthash::compute_num_partitions(kmers.size(), avg_partition_size); + assert(pthash_num_partitions > 0); + std::cout << " building MPHF with " << mphf_build_config.num_threads + << " threads and " << pthash_num_partitions + << " partitions (avg. partition size = " << avg_partition_size + << ")..." 
<< std::endl; } - kmers.clear(); - positions_in_bucket.clear(); - kmers.reserve(num_kmers_in_partition[partition_id]); - positions_in_bucket.reserve(num_kmers_in_partition[partition_id]); - cvb_positions.resize(num_kmers_in_partition[partition_id], num_bits_per_pos); - } + auto& F = mphfs[partition_id]; + F.build_in_internal_memory(kmers.begin(), kmers.size(), mphf_build_config); - if (i == buckets.size()) break; + if (build_config.verbose) { + std::cout << " built mphs[" << partition_id << "] for " << kmers.size() + << " kmers; bits/key = " + << static_cast(F.num_bits()) / F.num_keys() << std::endl; + } - assert(bucket_size > lower and bucket_size <= upper); - uint64_t pos_in_bucket = -1; - uint64_t prev_pos_in_seq = constants::invalid_uint64; - for (auto mt : bucket) // - { - if (mt.pos_in_seq != prev_pos_in_seq) { - prev_pos_in_seq = mt.pos_in_seq; - ++pos_in_bucket; + for (uint64_t i = 0; i != kmers.size(); ++i) { + Kmer kmer = kmers[i]; + uint64_t pos = F(kmer); + uint32_t pos_in_bucket = positions_in_bucket[i]; + cvb_positions.set(pos, pos_in_bucket); } - assert(mt.pos_in_seq >= mt.pos_in_kmer); + auto& P = positions[partition_id]; + cvb_positions.build(P); - mt.pos_in_seq = d.m_spss.strings_offsets.decode(mt.pos_in_seq).absolute_offset; - - const uint64_t starting_pos_of_super_kmer = mt.pos_in_seq - mt.pos_in_kmer; - kmer_iterator it( - d.m_spss.strings, k, Kmer::bits_per_char * starting_pos_of_super_kmer); - for (uint64_t i = 0; i != mt.num_kmers_in_super_kmer; ++i) { - auto kmer = it.get(); - if (build_config.canonical) { /* take the canonical kmer */ - auto kmer_rc = kmer; - kmer_rc.reverse_complement_inplace(k); - kmer = std::min(kmer, kmer_rc); - } - kmers.push_back(kmer); - positions_in_bucket.push_back(pos_in_bucket); - it.next(); + if (build_config.verbose) { + std::cout << " built positions[" << partition_id << "] for " << P.size() + << " kmers; bits/key = " << (P.num_bytes() * 8.0) / P.size() + << std::endl; } - assert(pos_in_bucket < (1ULL << 
cvb_positions.width())); + } + + /* advance partition state for the next iteration */ + lower = upper; + upper = 2 * lower; + num_bits_per_pos += 1; + if (partition_id + 1 == num_partitions - 1) { + upper = max_bucket_size; + num_bits_per_pos = log2_max_bucket_size; } } - assert(partition_id == num_partitions - 1); d.m_ssi.ski.mphfs = std::move(mphfs); d.m_ssi.ski.positions = std::move(positions); From 63ea2bdae878f0aa42c1bab7a861d1775d4abcc1 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 4 May 2026 20:48:42 +0000 Subject: [PATCH 04/32] strings: stream to disk during build, read via small windows The strings bit-vector is the largest in-RAM structure during construction. This commit moves it onto disk for the build phase, replacing bits::bit_vector::builder with a new disk_backed_strings storage: - append_bits() during step 1 (encode_strings) writes completed words to a tmp file, keeping only a small write window in RAM (~512 KiB by default). - After freeze(), make_reader() returns a forward-monotonic reader over the file with a small read window (~512 KiB). Each reader owns its own ifstream so multiple threads can read concurrently without contention on the writer. - The reader exposes get_word64(pos) const, matching the interface that kmer_iterator expects. Wiring: - compute_minimizer_tuples (step 2): each thread instantiates its own reader; per-thread RSS contribution is bounded by its window size, not by the strings size. - build_sparse_and_skew_index (step 7.1): the in-RAM d.m_spss.strings is no longer populated here; step 7.2 phase (B) reads via a reader instead. - A new step 8 materializes d.m_spss.strings from the on-disk file, immediately before the standard essentials::save path. This brings strings briefly back into RAM at the very end of the build; eliminating that final peak requires a streaming save path, which is a separate change. Cleanup: the dictionary_builder destructor removes the strings tmp file if it still exists (covers exception paths). 
The hash in step 5's RAM_taken_in_bytes calculation no longer counts strings (its in-RAM footprint is now just the writer window). Verified via `sshash build --check` on: - salmonella_enterica m=7 (heavy skew index, regular and canonical), - salmonella_100 m=11 -t 4 (multi-thread, 13M kmers). Save -> load -> query roundtrip on the same dataset matches 100% on the canonical query set. https://claude.ai/code/session_01BShS2GDASvEsCAbgJyQVBK --- include/builder/dictionary_builder.hpp | 35 ++- include/builder/disk_backed_strings.hpp | 282 ++++++++++++++++++++ src/builder/build_sparse_and_skew_index.cpp | 7 +- src/builder/compute_minimizer_tuples.cpp | 3 +- 4 files changed, 320 insertions(+), 7 deletions(-) create mode 100644 include/builder/disk_backed_strings.hpp diff --git a/include/builder/dictionary_builder.hpp b/include/builder/dictionary_builder.hpp index dea89a5..77e2c17 100644 --- a/include/builder/dictionary_builder.hpp +++ b/include/builder/dictionary_builder.hpp @@ -4,6 +4,7 @@ #include "include/dictionary.hpp" #include "include/offsets.hpp" #include "include/builder/util.hpp" +#include "include/builder/disk_backed_strings.hpp" #include "include/buckets_statistics.hpp" namespace sshash { @@ -12,7 +13,13 @@ template struct dictionary_builder // { dictionary_builder(build_configuration const& build_config) - : build_config(build_config), num_kmers(0), minimizers(build_config), total_time_musec(0) {} + : build_config(build_config) + , num_kmers(0) + , minimizers(build_config) + , strings_run_id(pthash::clock_type::now().time_since_epoch().count()) + , total_time_musec(0) {} + + ~dictionary_builder() { strings_builder.remove_file(); } void build(dictionary& d, std::string const& filename) // { @@ -32,8 +39,16 @@ struct dictionary_builder // total_time_musec = 0; + { + std::stringstream ss; + ss << build_config.tmp_dirname << "/sshash.tmp.run_" << strings_run_id + << ".strings.bin"; + strings_builder.open_for_writing(ss.str()); + } + do_step("step 1 (encode 
strings)", [&]() { encode_strings(filename); + strings_builder.freeze(); d.m_num_kmers = num_kmers; assert(strings_offsets_builder.size() >= 2); d.m_num_strings = strings_offsets_builder.size() - 1; @@ -66,6 +81,14 @@ struct dictionary_builder // assert(strings_offsets_builder.size() == 0); }); + /* The build above keeps `strings` exclusively on disk (accessed via + `disk_backed_strings::reader` windows). Materialize the in-RAM + bit_vector now for the standard `essentials::save` path. */ + do_step("step 8 (materialize strings to RAM)", [&]() { + strings_builder.load_into(d.m_spss.strings); + strings_builder.remove_file(); + }); + if (build_config.verbose) { print_time(total_time_musec, "total time"); d.print_space_breakdown(); @@ -82,9 +105,11 @@ struct dictionary_builder // uint64_t num_kmers; minimizers_tuples minimizers; typename Offsets::builder strings_offsets_builder; - bits::bit_vector::builder strings_builder; + disk_backed_strings strings_builder; weights::builder weights_builder; + uint64_t strings_run_id; + essentials::timer_type timer; essentials::json_lines build_stats; uint64_t total_time_musec; @@ -134,8 +159,10 @@ struct dictionary_builder // uint64_t RAM_available_in_bytes = essentials::GiB / 2; // at least 0.5 GB { - const uint64_t RAM_taken_in_bytes = (f.num_bits() + strings_builder.num_bits()) / 8 + - strings_offsets_builder.num_bytes(); + /* `strings_builder` is now disk-backed; its in-RAM footprint is + bounded by its window size, not by the strings size. 
*/ + const uint64_t RAM_taken_in_bytes = + f.num_bits() / 8 + strings_offsets_builder.num_bytes(); const uint64_t RAM_limit_in_bytes = build_config.ram_limit_in_GiB * essentials::GiB; if (RAM_limit_in_bytes > RAM_taken_in_bytes) { RAM_available_in_bytes = std::max(RAM_limit_in_bytes - RAM_taken_in_bytes, diff --git a/include/builder/disk_backed_strings.hpp b/include/builder/disk_backed_strings.hpp new file mode 100644 index 0000000..7ec7e46 --- /dev/null +++ b/include/builder/disk_backed_strings.hpp @@ -0,0 +1,282 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "external/pthash/external/bits/include/bit_vector.hpp" + +namespace sshash { + +/* + Disk-backed storage for the SSHash `strings` bit-vector. + + During step 1 (encode_strings) bits are appended via `append_bits`. + Internally only the trailing words are kept in RAM (a small "write + window"); completed words are flushed to a tmp file. RAM usage of the + writer is bounded by the window size, independently of the total + bit-vector size. + + After `freeze()`, callers create one or more `reader`s. Each reader owns + an ifstream and a small in-RAM read window, and supports + forward-monotonic `get_word64(bit_pos)` reads. The reader matches the + interface that `kmer_iterator` expects. + + For the standard dictionary save path, `load_into(bits::bit_vector&)` + materializes the full bit-vector in RAM (peaks briefly at strings size). 
+*/ +struct disk_backed_strings { + static constexpr uint64_t default_writer_buffer_words = uint64_t(1) << 16; // 512 KiB + static constexpr uint64_t default_reader_window_words = uint64_t(1) << 16; // 512 KiB + + disk_backed_strings() + : m_num_bits(0) + , m_writer_buffer_words(default_writer_buffer_words) + , m_words_on_disk(0) + , m_frozen(false) {} + + disk_backed_strings(disk_backed_strings const&) = delete; + disk_backed_strings& operator=(disk_backed_strings const&) = delete; + + /* Open `filename` for writing; truncates any existing contents. */ + void open_for_writing(std::string const& filename, + uint64_t writer_buffer_words = default_writer_buffer_words) { + m_filename = filename; + m_writer_buffer_words = std::max(2, writer_buffer_words); + m_num_bits = 0; + m_words_on_disk = 0; + m_frozen = false; + m_buf.clear(); + m_buf.reserve(m_writer_buffer_words); + m_writer.open(m_filename, std::ofstream::binary | std::ofstream::trunc); + if (!m_writer.is_open()) { + throw std::runtime_error("cannot open strings tmp file '" + m_filename + "'"); + } + } + + /* No-op: kept for source-compatibility with bits::bit_vector::builder. */ + void reserve(uint64_t /*num_bits*/) {} + + /* Append `len` bits (`len` <= 64) from `bits`. Same semantics as + bits::bit_vector::builder::append_bits. */ + void append_bits(uint64_t bits, uint64_t len) { + assert(len <= 64); + assert(len == 64 || (bits >> len) == 0); + if (!len) return; + const uint64_t pos_in_word = m_num_bits & 63; + m_num_bits += len; + if (pos_in_word == 0) { + m_buf.push_back(bits); + } else { + m_buf.back() |= bits << pos_in_word; + if (len > 64 - pos_in_word) m_buf.push_back(bits >> (64 - pos_in_word)); + } + if (m_buf.size() > m_writer_buffer_words) flush_completed_words(); + } + + /* Flush any remaining buffered words and close the writer. After this, + the file is ready for `make_reader()` and `load_into()`. 
*/ + void freeze() { + if (m_frozen) return; + if (!m_buf.empty()) { + m_writer.write(reinterpret_cast(m_buf.data()), + m_buf.size() * sizeof(uint64_t)); + m_words_on_disk += m_buf.size(); + m_buf.clear(); + m_buf.shrink_to_fit(); + } + m_writer.close(); + m_frozen = true; + } + + uint64_t num_bits() const { return m_num_bits; } + std::string const& filename() const { return m_filename; } + bool frozen() const { return m_frozen; } + + /* + Forward-monotonic reader over the strings file. + + `get_word64(bit_pos)` returns the 64-bit word starting at bit + position `bit_pos`. Successive calls must satisfy a forward-monotonic + access pattern in word units (calling code may seek forward via + `at()`-style calls in `kmer_iterator`, but never backward). Reads + past the end-of-file are returned as zero (matches the sentinel + zero-padding the SSHash builder writes at the tail of `strings`). + */ + struct reader { + reader() = default; + reader(reader&& other) noexcept { move_from(std::move(other)); } + reader& operator=(reader&& other) noexcept { + if (this != &other) { + close(); + move_from(std::move(other)); + } + return *this; + } + reader(reader const&) = delete; + reader& operator=(reader const&) = delete; + ~reader() { close(); } + + void open(std::string const& filename, uint64_t num_bits, + uint64_t window_capacity_words = default_reader_window_words) { + m_num_bits = num_bits; + m_total_words = (num_bits + 63) / 64; + m_window_capacity = std::max(2, window_capacity_words); + m_window.assign(m_window_capacity, 0); + m_window_size = 0; + m_window_start_word = 0; + m_in.open(filename, std::ifstream::binary); + if (!m_in.is_open()) { + throw std::runtime_error("cannot open strings tmp file '" + filename + "'"); + } + seek_window_to(0); + } + + bool is_open() const { return m_in.is_open(); } + + void close() { + if (m_in.is_open()) m_in.close(); + m_window.clear(); + m_window.shrink_to_fit(); + m_window_size = 0; + m_window_start_word = 0; + } + + uint64_t 
num_bits() const { return m_num_bits; } + + uint64_t get_word64(uint64_t bit_pos) const { + const uint64_t block = bit_pos >> 6; + const uint64_t shift = bit_pos & 63; + ensure_window_covers(block); + uint64_t a = (block >= m_window_start_word && + block < m_window_start_word + m_window_size) + ? m_window[block - m_window_start_word] + : uint64_t(0); + uint64_t word = a >> shift; + if (shift) { + const uint64_t next = block + 1; + uint64_t b = (next >= m_window_start_word && + next < m_window_start_word + m_window_size) + ? m_window[next - m_window_start_word] + : uint64_t(0); + word |= b << (64 - shift); + } + return word; + } + + private: + mutable std::ifstream m_in; + uint64_t m_num_bits = 0; + uint64_t m_total_words = 0; + mutable std::vector m_window; + uint64_t m_window_capacity = 0; + mutable uint64_t m_window_size = 0; + mutable uint64_t m_window_start_word = 0; + + void seek_window_to(uint64_t target_word) const { + m_window_start_word = target_word; + if (target_word >= m_total_words) { + m_window_size = 0; + return; + } + m_in.clear(); // clear any prior eof + m_in.seekg(static_cast(target_word * sizeof(uint64_t)), + std::ios::beg); + const uint64_t to_read = std::min(m_window_capacity, m_total_words - target_word); + m_in.read(reinterpret_cast(m_window.data()), + static_cast(to_read * sizeof(uint64_t))); + const std::streamsize nread = m_in.gcount(); + m_window_size = static_cast(nread) / sizeof(uint64_t); + } + + void ensure_window_covers(uint64_t block) const { + // We may need both `block` and `block + 1` (for cross-word shifts). + // The window covers [m_window_start_word, m_window_start_word + m_window_size). + const uint64_t need_end = block + 2; // exclusive + if (block >= m_window_start_word && need_end <= m_window_start_word + m_window_size) { + return; + } + // Slide forward (backward seeks are not supported). 
+ seek_window_to(block); + } + + void move_from(reader&& other) { + m_in = std::move(other.m_in); + m_num_bits = other.m_num_bits; + m_total_words = other.m_total_words; + m_window = std::move(other.m_window); + m_window_capacity = other.m_window_capacity; + m_window_size = other.m_window_size; + m_window_start_word = other.m_window_start_word; + other.m_num_bits = 0; + other.m_total_words = 0; + other.m_window_capacity = 0; + other.m_window_size = 0; + other.m_window_start_word = 0; + } + }; + + /* Create a new reader over the frozen file. */ + reader make_reader(uint64_t window_capacity_words = default_reader_window_words) const { + if (!m_frozen) { + throw std::runtime_error("disk_backed_strings: must freeze() before make_reader()"); + } + reader r; + r.open(m_filename, m_num_bits, window_capacity_words); + return r; + } + + /* + Materialize the full bit-vector in RAM. This briefly peaks at the + bit-vector size and is used immediately before `essentials::save`. + */ + void load_into(bits::bit_vector& bv) const { + if (!m_frozen) { + throw std::runtime_error("disk_backed_strings: must freeze() before load_into()"); + } + bits::bit_vector::builder b(m_num_bits); + auto& data_vec = b.data(); + const uint64_t total_words = (m_num_bits + 63) / 64; + if (total_words > 0) { + std::ifstream in(m_filename, std::ifstream::binary); + if (!in.is_open()) { + throw std::runtime_error("cannot open strings tmp file '" + m_filename + "'"); + } + in.read(reinterpret_cast(data_vec.data()), + static_cast(total_words * sizeof(uint64_t))); + in.close(); + } + b.build(bv); + } + + /* Delete the on-disk strings file. 
*/ + void remove_file() { + if (!m_filename.empty()) std::remove(m_filename.c_str()); + } + +private: + std::string m_filename; + std::ofstream m_writer; + uint64_t m_num_bits; + std::vector m_buf; + uint64_t m_writer_buffer_words; + uint64_t m_words_on_disk; + bool m_frozen; + + void flush_completed_words() { + if (m_buf.size() < 2) return; + const uint64_t to_flush = m_buf.size() - 1; // keep last (possibly partial) word + m_writer.write(reinterpret_cast(m_buf.data()), + static_cast(to_flush * sizeof(uint64_t))); + m_words_on_disk += to_flush; + m_buf[0] = m_buf.back(); + m_buf.resize(1); + } +}; + +} // namespace sshash diff --git a/src/builder/build_sparse_and_skew_index.cpp b/src/builder/build_sparse_and_skew_index.cpp index 1720274..9d8551b 100644 --- a/src/builder/build_sparse_and_skew_index.cpp +++ b/src/builder/build_sparse_and_skew_index.cpp @@ -110,7 +110,9 @@ void dictionary_builder::build_sparse_and_skew_index( control_codewords_builder.resize(num_minimizers, num_bits_for_control); strings_offsets_builder.build(d.m_spss.strings_offsets); - strings_builder.build(d.m_spss.strings); + /* `d.m_spss.strings` is materialized later, in step 8, from the on-disk + strings tmp file owned by `strings_builder`. Step 7.2 phase (B) reads + directly from the file via a `disk_backed_strings::reader` window. */ /* step 1. 
build sparse index */ assert(buckets_stats.num_buckets() == num_minimizers); @@ -451,7 +453,8 @@ void dictionary_builder::build_sparse_and_skew_index( const uint64_t k = build_config.k; const bool canonical = build_config.canonical; - kmer_iterator kmer_it(d.m_spss.strings, k); + auto strings_reader = strings_builder.make_reader(); + kmer_iterator kmer_it(strings_reader, k); while (merger.has_next()) // { diff --git a/src/builder/compute_minimizer_tuples.cpp b/src/builder/compute_minimizer_tuples.cpp index 94916d6..8458857 100644 --- a/src/builder/compute_minimizer_tuples.cpp +++ b/src/builder/compute_minimizer_tuples.cpp @@ -47,7 +47,8 @@ void dictionary_builder::compute_minimizer_tuples() // const uint64_t index_end = std::min(index_begin + num_sequences_per_thread, num_sequences); - kmer_iterator kmer_it(strings_builder, k); + auto strings_reader = strings_builder.make_reader(); + kmer_iterator kmer_it(strings_reader, k); hasher_type hasher(build_config.seed); minimizer_iterator minimizer_it(k, m, hasher); minimizer_iterator_rc minimizer_it_rc(k, m, hasher); From 54e98ebe0f52a439965686bd5ed4f05d04e721ab Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 4 May 2026 21:01:32 +0000 Subject: [PATCH 05/32] streaming dictionary save (no full strings in RAM at any point) Introduces a streaming-save build path that writes the entire dictionary to disk without ever materializing the strings bit-vector in RAM. Combined with the disk-backed strings storage from the previous commit, this means the strings live exclusively on disk for the lifetime of the build, eliminating the final RAM peak that step 8 previously reintroduced just before essentials::save. Mechanism: - disk_backed_strings::save_to(ostream&) emits the same byte layout as bits::bit_vector's serialization (uint64_t m_num_bits, size_t n, then n*8 bytes), reading the words from the tmp file in 64 KiB chunks. 
- streaming_strings_saver wraps essentials::generic_saver and overrides visit() for bits::bit_vector: when the visited instance matches a known address (d.m_spss.strings), the streaming serializer is invoked; everything else goes through the normal essentials path. Address matching avoids introducing a marker type into bits::bit_vector. API: - dictionary_builder::build() (existing) materializes strings in RAM at the end so the dictionary is query-ready (--check, etc.). - dictionary_builder::build_streaming_save() runs steps 1-7 and stream-saves directly to the output file, leaving d.m_spss.strings empty. The dictionary is *not* query-ready after this; reload from disk to query. - dictionary::build_streaming_save() exposes the new flow. tools/build.cpp uses the streaming-save path automatically when -o is given without --check; otherwise it falls back to the materializing path so --check can run queries against d. Verified by building the same input via both paths and diff'ing the output file: byte-identical for regular, --canonical, --weighted, and 4-thread builds. Loading the streaming-saved file via essentials::load and running the standalone `sshash check` and `sshash query` tools returns "EVERYTHING OK!" and 100% positive matches. 
https://claude.ai/code/session_01BShS2GDASvEsCAbgJyQVBK --- include/builder/dictionary_builder.hpp | 78 ++++++++++++------ include/builder/disk_backed_strings.hpp | 46 +++++++++++ include/builder/streaming_save.hpp | 101 ++++++++++++++++++++++++ include/dictionary.hpp | 13 ++- src/builder/build.cpp | 43 +++++++--- tools/build.cpp | 43 ++++++---- 6 files changed, 271 insertions(+), 53 deletions(-) create mode 100644 include/builder/streaming_save.hpp diff --git a/include/builder/dictionary_builder.hpp b/include/builder/dictionary_builder.hpp index 77e2c17..3c4c5fe 100644 --- a/include/builder/dictionary_builder.hpp +++ b/include/builder/dictionary_builder.hpp @@ -5,6 +5,7 @@ #include "include/offsets.hpp" #include "include/builder/util.hpp" #include "include/builder/disk_backed_strings.hpp" +#include "include/builder/streaming_save.hpp" #include "include/buckets_statistics.hpp" namespace sshash { @@ -21,8 +22,55 @@ struct dictionary_builder // ~dictionary_builder() { strings_builder.remove_file(); } - void build(dictionary& d, std::string const& filename) // + /* + Build a query-ready dictionary in `d`. After this returns, + `d.m_spss.strings` is materialized in RAM (peak briefly equals the + strings size). Use this when the caller needs to query `d` post-build + (e.g., `--check`). + */ + void build(dictionary& d, std::string const& filename) { + run_steps_1_through_7(d, filename); + do_step("step 8 (materialize strings to RAM)", [&]() { + strings_builder.load_into(d.m_spss.strings); + strings_builder.remove_file(); + }); + finalize_stats(d); + } + + /* + Build the dictionary and stream-save it to `output_filename` without + ever materializing `strings` in RAM. After this returns, `d` is *not* + query-ready (`d.m_spss.strings` is empty). Use this when the caller + only needs the on-disk index file and wants to keep peak RAM bounded + by the build phase. 
+ */ + void build_streaming_save(dictionary& d, // + std::string const& filename, // + std::string const& output_filename) // { + run_steps_1_through_7(d, filename); + do_step("step 8 (stream-save dictionary to disk)", [&]() { + save_streaming(d, output_filename.c_str(), &d.m_spss.strings, strings_builder); + strings_builder.remove_file(); + }); + finalize_stats(d); + } + + build_configuration build_config; + uint64_t num_kmers; + minimizers_tuples minimizers; + typename Offsets::builder strings_offsets_builder; + disk_backed_strings strings_builder; + weights::builder weights_builder; + + uint64_t strings_run_id; + + essentials::timer_type timer; + essentials::json_lines build_stats; + uint64_t total_time_musec; + +private: + void run_steps_1_through_7(dictionary& d, std::string const& filename) { d.m_k = build_config.k; d.m_m = build_config.m; d.m_spss.k = build_config.k; @@ -80,18 +128,14 @@ struct dictionary_builder // minimizers.remove_tmp_file(); assert(strings_offsets_builder.size() == 0); }); + } - /* The build above keeps `strings` exclusively on disk (accessed via - `disk_backed_strings::reader` windows). Materialize the in-RAM - bit_vector now for the standard `essentials::save` path. */ - do_step("step 8 (materialize strings to RAM)", [&]() { - strings_builder.load_into(d.m_spss.strings); - strings_builder.remove_file(); - }); - + void finalize_stats(dictionary& d) { if (build_config.verbose) { print_time(total_time_musec, "total time"); - d.print_space_breakdown(); + /* `print_space_breakdown` reads d.m_spss.strings; only safe in + the materialize-to-RAM flow. 
*/ + if (d.m_spss.strings.num_bits() > 0) d.print_space_breakdown(); } build_stats.add("total_build_time_in_microsec", total_time_musec); @@ -101,20 +145,6 @@ struct dictionary_builder // if (build_config.verbose) build_stats.print(); } - build_configuration build_config; - uint64_t num_kmers; - minimizers_tuples minimizers; - typename Offsets::builder strings_offsets_builder; - disk_backed_strings strings_builder; - weights::builder weights_builder; - - uint64_t strings_run_id; - - essentials::timer_type timer; - essentials::json_lines build_stats; - uint64_t total_time_musec; - -private: void print_time(double time_in_musec, std::string const& message) { std::cout << "=== " << message << ": " << time_in_musec / 1'000'000 << " [sec] (" << (time_in_musec * 1000) / num_kmers << " [ns/kmer])" << std::endl; diff --git a/include/builder/disk_backed_strings.hpp b/include/builder/disk_backed_strings.hpp index 7ec7e46..dabafe9 100644 --- a/include/builder/disk_backed_strings.hpp +++ b/include/builder/disk_backed_strings.hpp @@ -231,6 +231,52 @@ struct disk_backed_strings { return r; } + /* + Stream the strings to `os` in the same byte format that + `essentials::generic_saver::visit(bits::bit_vector const&)` would + produce — i.e., + uint64_t m_num_bits; + size_t n; // number of 64-bit words + uint64_t m_data[n]; + — without ever materializing the full bit-vector in RAM. The bytes + are read from the tmp file in fixed-size chunks. + + This relies on `bits::bit_vector::visit_impl` writing exactly two + fields (`m_num_bits` and the `m_data` owning_span) and on + `generic_saver::visit_seq` writing `size_t n` followed by the raw + `n * sizeof(uint64_t)` bytes. If `bits::bit_vector` ever changes its + on-disk representation, this method must be updated to match. 
+ */ + void save_to(std::ostream& os) const { + if (!m_frozen) { + throw std::runtime_error("disk_backed_strings: must freeze() before save_to()"); + } + const uint64_t num_bits = m_num_bits; + os.write(reinterpret_cast(&num_bits), sizeof(uint64_t)); + const uint64_t total_words = (num_bits + 63) / 64; + const std::size_t n = static_cast(total_words); + os.write(reinterpret_cast(&n), sizeof(std::size_t)); + if (total_words == 0) return; + std::ifstream in(m_filename, std::ifstream::binary); + if (!in.is_open()) { + throw std::runtime_error("cannot open strings tmp file '" + m_filename + "'"); + } + std::vector buffer(uint64_t(64) << 10); // 64 KiB + uint64_t bytes_remaining = total_words * sizeof(uint64_t); + while (bytes_remaining > 0) { + const std::streamsize chunk = static_cast( + std::min(buffer.size(), bytes_remaining)); + in.read(buffer.data(), chunk); + const std::streamsize got = in.gcount(); + if (got <= 0) { + throw std::runtime_error("unexpected EOF in strings tmp file '" + m_filename + "'"); + } + os.write(buffer.data(), got); + bytes_remaining -= static_cast(got); + } + in.close(); + } + /* Materialize the full bit-vector in RAM. This briefly peaks at the bit-vector size and is used immediately before `essentials::save`. diff --git a/include/builder/streaming_save.hpp b/include/builder/streaming_save.hpp new file mode 100644 index 0000000..73e4315 --- /dev/null +++ b/include/builder/streaming_save.hpp @@ -0,0 +1,101 @@ +#pragma once + +#include +#include +#include +#include +#include + +#include "essentials.hpp" +#include "external/pthash/external/bits/include/bit_vector.hpp" + +#include "include/builder/disk_backed_strings.hpp" + +namespace sshash { + +/* + A saver that mirrors `essentials::generic_saver`, except that any visit + to a specific `bits::bit_vector` instance (identified by address) is + redirected to `disk_backed_strings::save_to`, which streams the strings + bytes from the on-disk tmp file. 
All other visits go through the regular + `essentials` path. + + Using address-based identification means we don't need to add any + intermediate type or marker to `bits::bit_vector` itself. +*/ +struct streaming_strings_saver { + streaming_strings_saver(std::ostream& os, // + bits::bit_vector const* strings_addr, // + disk_backed_strings const* strings_storage) // + : m_os(os), m_strings_addr(strings_addr), m_strings_storage(strings_storage) { + if (m_strings_addr == nullptr || m_strings_storage == nullptr) { + throw std::runtime_error("streaming_strings_saver requires non-null arguments"); + } + } + + template + void visit(T const& val) { + if constexpr (std::is_same_v) { + if (&val == m_strings_addr) { + m_strings_storage->save_to(m_os); + return; + } + } + if constexpr (essentials::is_pod::value) { + essentials::save_pod(m_os, val); + } else { + val.visit(*this); + } + } + + template + void visit(std::vector const& vec) { + visit_seq(vec); + } + + template + void visit(essentials::owning_span const& vec) { + visit_seq(vec); + } + + std::size_t bytes() { return static_cast(m_os.tellp()); } + +private: + std::ostream& m_os; + bits::bit_vector const* m_strings_addr; + disk_backed_strings const* m_strings_storage; + + template + void visit_seq(Vec const& vec) { + using T = typename Vec::value_type; + const std::size_t n = vec.size(); + visit(n); + if constexpr (essentials::is_pod::value) { + m_os.write(reinterpret_cast(vec.data()), + static_cast(sizeof(T) * n)); + } else { + for (auto const& v : vec) visit(v); + } + } +}; + +/* + Save `t` to `filename`, streaming any embedded `bits::bit_vector` whose + address matches `strings_addr` from `strings_storage` instead of from + RAM. Other fields are saved using the standard `essentials` path. 
+*/ +template +void save_streaming(T const& t, char const* filename, // + bits::bit_vector const* strings_addr, // + disk_backed_strings const& strings_storage) // +{ + std::ofstream out(filename, std::ios::binary); + if (!out.good()) { + throw std::runtime_error(std::string("error opening file '") + filename + "' for writing"); + } + streaming_strings_saver saver(out, strings_addr, &strings_storage); + saver.visit(t); + out.close(); +} + +} // namespace sshash diff --git a/include/dictionary.hpp b/include/dictionary.hpp index a30b8c4..7790efb 100644 --- a/include/dictionary.hpp +++ b/include/dictionary.hpp @@ -25,9 +25,20 @@ struct dictionary // , m_m(0) , m_canonical(false) {} - /* Build from input file. */ + /* Build from input file. After this returns, `*this` is query-ready. */ void build(std::string const& input_filename, build_configuration const& build_config); + /* + Build from input file and stream-save the resulting dictionary to + `output_filename`. The strings bit-vector is never materialized in + RAM during construction, so peak RAM is bounded by the build phase + only. After this returns, `*this` is *not* query-ready + (`m_spss.strings` is empty); reload via `essentials::load` to query. 
+ */ + void build_streaming_save(std::string const& input_filename, + build_configuration const& build_config, + std::string const& output_filename); + essentials::version_number vnum() const { return m_vnum; } uint64_t num_kmers() const { return m_num_kmers; } uint64_t num_strings() const { return m_num_strings; } diff --git a/src/builder/build.cpp b/src/builder/build.cpp index e9eed1d..76d0b97 100644 --- a/src/builder/build.cpp +++ b/src/builder/build.cpp @@ -6,25 +6,42 @@ namespace sshash { +namespace { + +inline void validate_build_config_or_throw(build_configuration const& bc, uint64_t max_k, + uint64_t max_m) { + if (bc.k == 0) throw std::runtime_error("k must be > 0"); + if (bc.k > max_k) { + throw std::runtime_error("k must be less <= " + std::to_string(max_k) + + " but got k = " + std::to_string(bc.k)); + } + if (bc.m == 0) throw std::runtime_error("m must be > 0"); + if (bc.m > max_m) { + throw std::runtime_error("m must be less <= " + std::to_string(max_m) + + " but got m = " + std::to_string(bc.m)); + } + if (bc.m > bc.k) throw std::runtime_error("m must be <= k"); +} + +} // namespace + template void dictionary::build(std::string const& filename, build_configuration const& build_config) // { - /* Validate the build configuration. 
*/ - if (build_config.k == 0) throw std::runtime_error("k must be > 0"); - if (build_config.k > Kmer::max_k) { - throw std::runtime_error("k must be less <= " + std::to_string(Kmer::max_k) + - " but got k = " + std::to_string(build_config.k)); - } - if (build_config.m == 0) throw std::runtime_error("m must be > 0"); - if (build_config.m > Kmer::max_m) { - throw std::runtime_error("m must be less <= " + std::to_string(Kmer::max_m) + - " but got m = " + std::to_string(build_config.m)); - } - if (build_config.m > build_config.k) throw std::runtime_error("m must be <= k"); - + validate_build_config_or_throw(build_config, Kmer::max_k, Kmer::max_m); dictionary_builder builder(build_config); builder.build(*this, filename); } +template +void dictionary::build_streaming_save( + std::string const& input_filename, build_configuration const& build_config, + std::string const& output_filename) // +{ + validate_build_config_or_throw(build_config, Kmer::max_k, Kmer::max_m); + dictionary_builder builder(build_config); + builder.build_streaming_save(*this, input_filename, output_filename); +} + } // namespace sshash diff --git a/tools/build.cpp b/tools/build.cpp index 6630386..fbc7d1d 100644 --- a/tools/build.cpp +++ b/tools/build.cpp @@ -73,25 +73,38 @@ int build(int argc, char** argv) { // build_config.print(); - essentials::logger("building data structure..."); - dictionary_type dict; - dict.build(input_filename, build_config); - bool check = parser.get("check"); - if (check) { - check_correctness_lookup_access(dict, input_filename); - check_correctness_navigational_kmer_query(dict, input_filename); - check_correctness_navigational_string_query(dict); - if (build_config.weighted) check_correctness_weights(dict, input_filename); - check_correctness_kmer_iterator(dict); - check_correctness_string_iterator(dict); - } + bool has_output = parser.parsed("output_filename"); - if (parser.parsed("output_filename")) { + dictionary_type dict; + + if (has_output && !check) { + /* 
Streaming-save path: keeps peak RAM bounded by the build phase + (the strings bit-vector is never fully in RAM). After this returns + `dict` is not query-ready; reload from disk to query. */ auto output_filename = parser.get("output_filename"); - essentials::logger("saving data structure to disk..."); - essentials::save(dict, output_filename.c_str()); + essentials::logger("building data structure (streaming save)..."); + dict.build_streaming_save(input_filename, build_config, output_filename); essentials::logger("DONE"); + } else { + essentials::logger("building data structure..."); + dict.build(input_filename, build_config); + + if (check) { + check_correctness_lookup_access(dict, input_filename); + check_correctness_navigational_kmer_query(dict, input_filename); + check_correctness_navigational_string_query(dict); + if (build_config.weighted) check_correctness_weights(dict, input_filename); + check_correctness_kmer_iterator(dict); + check_correctness_string_iterator(dict); + } + + if (has_output) { + auto output_filename = parser.get("output_filename"); + essentials::logger("saving data structure to disk..."); + essentials::save(dict, output_filename.c_str()); + essentials::logger("DONE"); + } } return 0; From e5d26127427459a1f0cf167b62553668d3c9f43c Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 4 May 2026 21:20:08 +0000 Subject: [PATCH 06/32] step 7.1: drop redundant tuples copy; point bucket_type into mmap Previously step 7.1 copied every minimizer tuple of every non-singleton bucket into an in-RAM vector `tuples`, just to give bucket_type a contiguous backing store. The mmap'd `input` already provides exactly that, so the copy was pure overhead (~18 B per super-kmer in non-singleton buckets, scaling with the input). Now bucket_type stores raw minimizer_tuple pointers into input.data() directly, and `input` is kept open through step 7.2 phase (A) (which is the last consumer). After phase (A) both `buckets` and `input` are released. 
Verified byte-identical output vs the previous commit on salmonella_enterica m=7, plus full --check on regular, --canonical, multi-thread (-t 4), and --weighted builds. https://claude.ai/code/session_01BShS2GDASvEsCAbgJyQVBK --- src/builder/build_sparse_and_skew_index.cpp | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/src/builder/build_sparse_and_skew_index.cpp b/src/builder/build_sparse_and_skew_index.cpp index 9d8551b..891a61b 100644 --- a/src/builder/build_sparse_and_skew_index.cpp +++ b/src/builder/build_sparse_and_skew_index.cpp @@ -137,10 +137,9 @@ void dictionary_builder::build_sparse_and_skew_index( std::vector buckets; buckets.reserve(num_buckets_larger_than_1_not_in_skew_index + num_buckets_in_skew_index); - std::vector tuples; // backed memory - tuples.reserve(num_super_kmers_in_buckets_larger_than_1); - // Second pass: collect buckets > 1 for sorting AND handle size-1 buckets + /* Second pass: register buckets > 1 (pointing directly into the mmap'd + `input`, no copy) and handle size-1 buckets inline. */ for (minimizers_tuples_iterator it(input.data(), input.data() + input.size()); // it.has_next(); it.next()) // { @@ -163,18 +162,14 @@ void dictionary_builder::build_sparse_and_skew_index( } } } else { - // Collect buckets > 1 for later processing - minimizer_tuple const* begin = tuples.data() + tuples.size(); - std::copy(bucket.begin_ptr(), bucket.end_ptr(), std::back_inserter(tuples)); - minimizer_tuple const* end = tuples.data() + tuples.size(); - buckets.push_back(bucket_type(begin, end)); + /* Buckets > 1: store pointers directly into the mmap'd `input`. + `input` is kept open through step 7.2 phase (A). 
*/ + buckets.push_back(bucket_type(bucket.begin_ptr(), bucket.end_ptr())); } } assert(buckets.size() == num_buckets_larger_than_1_not_in_skew_index + num_buckets_in_skew_index); - input.close(); - std::sort(buckets.begin(), buckets.end(), [](bucket_type const& x, bucket_type const& y) { return x.size() < y.size(); }); @@ -408,6 +403,12 @@ void dictionary_builder::build_sparse_and_skew_index( assert(partition_id == num_partitions - 1); } + /* `buckets` and the mmap'd `input` are no longer needed: phase (B) walks + the sorted requests and per-partition tmp files, phase (C) walks the + per-partition tmp files. Free both now to bound RAM. */ + std::vector().swap(buckets); + input.close(); + if (build_config.verbose) { uint64_t total_kmers_in_skew = 0; for (uint64_t p = 0; p != num_partitions; ++p) { From 0301201892764e5501e847ba402675fe6197fb3e Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 4 May 2026 21:29:44 +0000 Subject: [PATCH 07/32] step 7.2 phase C: stream per-partition kmers from disk; external-memory MPHF Phase C used to materialize the full per-skew-partition working set in RAM: - kmers vector (~16 B/kmer, default uint64_t-backed kmer) - positions_in_bucket vector (~4 B/kmer) - cvb_positions builder (~num_bits_per_pos / 8 B/kmer, the actual stored output) For a partition with N kmers this peaked at ~21 N bytes (e.g. 20 GB for a 1 B-kmer partition). The kmers and positions_in_bucket vectors were redundant in-RAM copies of data already on disk in the per-partition tmp file written by phase (B). This commit replaces them with two streaming passes over the tmp file: (1) MPHF build via pthash's `build_in_external_memory` driven by a small forward iterator (`skew_partition_kmer_iterator`) that reads `(kmer.bits, pos_in_bucket)` records via a shared_ptr. pthash spills hashes to `tmp_dirname` under a `--ram-limit / 2` RAM budget rather than holding all keys + hashes simultaneously. 
(2) A second sequential pass over the same tmp file fills cvb_positions: for each `(kmer, pib)` record it sets cvb_positions[F(kmer)] = pib. Only cvb_positions itself stays resident through both passes, and it's the actual stored output (not a transient). The iterator must be copyable because pthash's `build_in_external_memory` takes the iterator by value; the shared_ptr means copies share the underlying stream state. After the build call returns the original at the call site is unused, so the shared advancement is harmless. Verified byte-identical output vs the previous commit on salmonella_enterica m=7, plus full --check on regular, --canonical, multi-thread (-t 4), and --weighted builds. https://claude.ai/code/session_01BShS2GDASvEsCAbgJyQVBK --- src/builder/build_sparse_and_skew_index.cpp | 135 ++++++++++++++------ 1 file changed, 99 insertions(+), 36 deletions(-) diff --git a/src/builder/build_sparse_and_skew_index.cpp b/src/builder/build_sparse_and_skew_index.cpp index 891a61b..0a4a3f8 100644 --- a/src/builder/build_sparse_and_skew_index.cpp +++ b/src/builder/build_sparse_and_skew_index.cpp @@ -36,6 +36,62 @@ struct kmer_extraction_request { }; #pragma pack(pop) +/* + Forward iterator over a per-skew-partition tmp file produced by step + 7.2 phase (B). Each record is `(kmer.bits, uint32_t pos_in_bucket)`. + This iterator yields successive Kmer values, exposing the minimal + interface (`*it`, `++it`) that pthash's external-memory partitioned PHF + builder consumes. + + pthash takes the iterator by value, so it must be copyable. The + underlying `ifstream` is held via `shared_ptr` and shared between + copies; pthash's copy advances the shared stream state, which is fine + because the original at the call site is no longer used after the + build call returns. 
+*/ +template +struct skew_partition_kmer_iterator { + using iterator_category = std::forward_iterator_tag; + using value_type = Kmer; + using difference_type = std::ptrdiff_t; + using reference = Kmer const&; + using pointer = Kmer const*; + + skew_partition_kmer_iterator() = default; + + void open(std::string const& filename) { + m_in = std::make_shared(filename, std::ifstream::binary); + if (!m_in->is_open()) { + throw std::runtime_error("cannot open skew-partition tmp file '" + filename + "'"); + } + advance(); + } + + void close() { + if (m_in && m_in->is_open()) m_in->close(); + m_in.reset(); + } + + Kmer const& operator*() const { return m_current; } + skew_partition_kmer_iterator& operator++() { + advance(); + return *this; + } + +private: + std::shared_ptr m_in; + Kmer m_current; + + void advance() { + decltype(Kmer{}.bits) bits; + m_in->read(reinterpret_cast(&bits), sizeof(bits)); + if (m_in->gcount() != static_cast(sizeof(bits))) return; + uint32_t pib; + m_in->read(reinterpret_cast(&pib), sizeof(pib)); // skip pos_in_bucket + m_current.bits = bits; + } +}; + template void dictionary_builder::build_sparse_and_skew_index( dictionary& d) // @@ -504,6 +560,11 @@ void dictionary_builder::build_sparse_and_skew_index( mphf_build_config.verbose = false; mphf_build_config.num_threads = build_config.num_threads; mphf_build_config.avg_partition_size = constants::avg_partition_size; + /* External-memory PHF: bound RAM by `--ram-limit` and spill hashes + to `tmp_dirname` rather than holding the partition's keys + (~16 B/kmer) and their hashes simultaneously in RAM. 
*/ + mphf_build_config.ram = (build_config.ram_limit_in_GiB * essentials::GiB) / 2; + mphf_build_config.tmp_dir = build_config.tmp_dirname; uint64_t lower = min_size; uint64_t upper = 2 * lower; @@ -525,59 +586,61 @@ void dictionary_builder::build_sparse_and_skew_index( if (n > 0) // { - std::vector kmers; - std::vector positions_in_bucket; - kmers.reserve(n); - positions_in_bucket.reserve(n); - - { - const std::string fn = skew_partition_filename(partition_id); - std::ifstream in(fn, std::ifstream::binary); - if (!in.is_open()) { - throw std::runtime_error("cannot open skew-partition tmp file"); - } - for (uint64_t i = 0; i != n; ++i) { - Kmer kmer; - in.read(reinterpret_cast(&kmer.bits), sizeof(kmer.bits)); - uint32_t pib; - in.read(reinterpret_cast(&pib), sizeof(pib)); - kmers.push_back(kmer); - positions_in_bucket.push_back(pib); - } - in.close(); - std::remove(fn.c_str()); - } - - bits::compact_vector::builder cvb_positions; - cvb_positions.resize(n, num_bits_per_pos); + const std::string fn = skew_partition_filename(partition_id); if (build_config.verbose) { const uint64_t avg_partition_size = - pthash::compute_avg_partition_size(kmers.size(), mphf_build_config); + pthash::compute_avg_partition_size(n, mphf_build_config); const uint64_t pthash_num_partitions = - pthash::compute_num_partitions(kmers.size(), avg_partition_size); + pthash::compute_num_partitions(n, avg_partition_size); assert(pthash_num_partitions > 0); - std::cout << " building MPHF with " << mphf_build_config.num_threads - << " threads and " << pthash_num_partitions + std::cout << " building MPHF (external memory) with " + << mphf_build_config.num_threads << " threads and " + << pthash_num_partitions << " partitions (avg. partition size = " << avg_partition_size << ")..." << std::endl; } + /* (1) Build the MPHF by streaming kmers from the partition + file. pthash's external-memory builder spills hashes + to tmp_dir under its own RAM budget; the iterator's + footprint is constant. 
*/ auto& F = mphfs[partition_id]; - F.build_in_internal_memory(kmers.begin(), kmers.size(), mphf_build_config); + { + skew_partition_kmer_iterator iter; + iter.open(fn); + F.build_in_external_memory(iter, n, mphf_build_config); + iter.close(); + } if (build_config.verbose) { - std::cout << " built mphs[" << partition_id << "] for " << kmers.size() + std::cout << " built mphs[" << partition_id << "] for " << F.num_keys() << " kmers; bits/key = " << static_cast(F.num_bits()) / F.num_keys() << std::endl; } - for (uint64_t i = 0; i != kmers.size(); ++i) { - Kmer kmer = kmers[i]; - uint64_t pos = F(kmer); - uint32_t pos_in_bucket = positions_in_bucket[i]; - cvb_positions.set(pos, pos_in_bucket); + /* (2) Re-stream the file to fill cvb_positions: for each + (kmer, pos_in_bucket), set cvb_positions[F(kmer)] = + pos_in_bucket. Only cvb_positions itself stays in RAM + (n * num_bits_per_pos bits, the actual stored output). */ + bits::compact_vector::builder cvb_positions; + cvb_positions.resize(n, num_bits_per_pos); + { + std::ifstream in(fn, std::ifstream::binary); + if (!in.is_open()) { + throw std::runtime_error("cannot open skew-partition tmp file"); + } + for (uint64_t i = 0; i != n; ++i) { + Kmer kmer; + in.read(reinterpret_cast(&kmer.bits), sizeof(kmer.bits)); + uint32_t pib; + in.read(reinterpret_cast(&pib), sizeof(pib)); + cvb_positions.set(F(kmer), pib); + } + in.close(); } + std::remove(fn.c_str()); + auto& P = positions[partition_id]; cvb_positions.build(P); From 5f9ec800b7db6d010a7f8a6bf0dec2ab2ed9d63d Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 5 May 2026 02:56:05 +0000 Subject: [PATCH 08/32] step 7.1 + 7.2 phase A: drop mmap; single ifstream pass MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Removes the last mmap from the build hot path. 
The merged minimizers file used to be opened via mm::file_source and
walked twice — once for stats and once to populate a size-sorted in-RAM
`buckets` vector — and then walked again in step 7.2 phase (A) via
pointers into the mmap. The recent "point bucket_type into the mmap"
change pushed even more work onto the page cache, which violates the
user's hard RAM budget (mmap'd pages can remain resident up to the full
file size when the machine has enough memory).

This commit replaces all of that with two sequential std::ifstream
passes (no mmap) over the merged minimizers file:

Pass 1 (stats): unchanged in spirit. Uses a new
`streaming_minimizer_bucket_reader` that buffers one bucket at a time
(peak ~ max_bucket_size * 18 B). Feeds `buckets_statistics` exactly as
before.

Pass 2 (combined sparse + heavy + emit kmer requests): folds the former
step 7.1 main pass and step 7.2 phase (A) into a single bucket-by-bucket
loop. The size-sorted iteration over `buckets` is gone; instead:

- `begin_buckets_of_size[s]` is precomputed from the bucket-size
  histogram (new accessor `buckets_statistics::num_buckets_of_size`),
- mid_load positions are written via per-size cursors using
  `compact_vector::builder::set` instead of `push_back`,
- heavy_load positions are appended in file order via a single monotone
  cursor,
- heavy buckets emit kmer-extraction requests in-line.

The `buckets` vector and the entire `tuples` array are gone. No memory
mapping anywhere in step 7. RAM footprint of step 7.1 is now bounded by:

- max_bucket_size * 18 B (one bucket at a time),
- the sparse-index builders being assembled (proportional to
  non-singleton positions, bits-packed),
- the kmer-extraction request buffer (~ ram_limit / 4).

Output bytes differ from the previous commit because the ordering of
positions inside mid_load_buckets and heavy_load_buckets is now file
order instead of size-sorted order; the codewords are updated to match.
The index is self-consistent: full --check passes on regular, --canonical, multi-thread (-t 4), and --weighted builds, and a streaming- save round-trip via `sshash check` and `sshash query` returns "EVERYTHING OK!" and 100% positive matches. https://claude.ai/code/session_01BShS2GDASvEsCAbgJyQVBK --- include/buckets_statistics.hpp | 6 + src/builder/build_sparse_and_skew_index.cpp | 510 ++++++++++---------- 2 files changed, 261 insertions(+), 255 deletions(-) diff --git a/include/buckets_statistics.hpp b/include/buckets_statistics.hpp index 6f582d7..13676f1 100644 --- a/include/buckets_statistics.hpp +++ b/include/buckets_statistics.hpp @@ -59,6 +59,12 @@ struct buckets_statistics { uint64_t max_bucket_size() const { return m_max_bucket_size; } uint64_t max_sparse_buckets_per_size() const { return m_max_sparse_buckets_per_size; } + /* Histogram bin: number of buckets whose size equals `s`. Bins beyond + MAX_BUCKET_SIZE are not tracked individually and return 0. */ + uint64_t num_buckets_of_size(uint64_t s) const { + return s < m_bucket_sizes.size() ? m_bucket_sizes[s] : uint64_t(0); + } + void print_full() const { std::cout << "=== bucket statistics (full) === \n"; for (uint64_t bucket_size = 1, prev_bucket_size = 0, prev_kmers_in_buckets = 0, diff --git a/src/builder/build_sparse_and_skew_index.cpp b/src/builder/build_sparse_and_skew_index.cpp index 0a4a3f8..a45df4e 100644 --- a/src/builder/build_sparse_and_skew_index.cpp +++ b/src/builder/build_sparse_and_skew_index.cpp @@ -36,6 +36,55 @@ struct kmer_extraction_request { }; #pragma pack(pop) +/* + Streaming reader over the merged minimizers file. Reads minimizer_tuple + records via std::ifstream (no mmap), and groups consecutive tuples by + minimizer into "buckets" — exactly as `minimizers_tuples_iterator` does + over an mmap'd buffer, but with bounded RAM (~ one bucket at a time). 
+ + The caller passes a vector to receive the bucket's tuples; for typical + inputs this peaks at max_bucket_size * sizeof(minimizer_tuple). +*/ +struct streaming_minimizer_bucket_reader { + void open(std::string const& filename) { + m_in.open(filename, std::ifstream::binary); + if (!m_in.is_open()) { + throw std::runtime_error("cannot open minimizers tmp file '" + filename + "'"); + } + // Read first record into the lookahead slot, if any. + m_in.read(reinterpret_cast(&m_lookahead), sizeof(minimizer_tuple)); + m_eof = (m_in.gcount() != static_cast(sizeof(minimizer_tuple))); + } + + void close() { + if (m_in.is_open()) m_in.close(); + } + + bool has_next_bucket() const { return !m_eof; } + + /* Read the next bucket into `bucket_out` (cleared first). All tuples in + a bucket share the same minimizer. Returns the bucket's minimizer. */ + uint64_t next_bucket(std::vector& bucket_out) { + bucket_out.clear(); + assert(!m_eof); + const uint64_t mm = m_lookahead.minimizer; + do { + bucket_out.push_back(m_lookahead); + m_in.read(reinterpret_cast(&m_lookahead), sizeof(minimizer_tuple)); + if (m_in.gcount() != static_cast(sizeof(minimizer_tuple))) { + m_eof = true; + break; + } + } while (m_lookahead.minimizer == mm); + return mm; + } + +private: + std::ifstream m_in; + minimizer_tuple m_lookahead; + bool m_eof = true; +}; + /* Forward iterator over a per-skew-partition tmp file produced by step 7.2 phase (B). Each record is `(kmer.bits, uint32_t pos_in_bucket)`. 
@@ -104,41 +153,51 @@ void dictionary_builder::build_sparse_and_skew_index( const uint64_t min_size = 1ULL << constants::min_l; const uint64_t num_bits_per_offset = strings_offsets_builder.num_bits_per_offset(); - mm::file_source input(minimizers.get_minimizers_filename(), - mm::advice::sequential); + const std::string minimizers_filename = minimizers.get_minimizers_filename(); buckets_statistics buckets_stats(num_minimizers, num_kmers, num_minimizer_positions); uint64_t num_buckets_larger_than_1_not_in_skew_index = 0; uint64_t num_buckets_in_skew_index = 0; - uint64_t num_super_kmers_in_buckets_larger_than_1 = 0; uint64_t num_minimizer_positions_of_buckets_larger_than_1 = 0; uint64_t num_minimizer_positions_of_buckets_in_skew_index = 0; /* - First pass: collect bucket statistics to compute tighter bound. + Pass 1: streaming statistics over the merged minimizers file. Buckets + are accumulated one at a time via std::ifstream-backed reads (no + mmap), so RAM usage is bounded by max_bucket_size * sizeof(tuple). 
*/ - for (minimizers_tuples_iterator it(input.data(), input.data() + input.size()); // - it.has_next(); it.next()) // { - auto bucket = it.bucket(); - const uint64_t bucket_size = bucket.size(); - buckets_stats.add_bucket_size(bucket_size); - - if (bucket_size > 1) { - if (bucket_size <= min_size) { - ++num_buckets_larger_than_1_not_in_skew_index; - num_minimizer_positions_of_buckets_larger_than_1 += bucket_size; - } else { - ++num_buckets_in_skew_index; - num_minimizer_positions_of_buckets_in_skew_index += bucket_size; + streaming_minimizer_bucket_reader reader; + reader.open(minimizers_filename); + std::vector bucket_buf; + while (reader.has_next_bucket()) { + reader.next_bucket(bucket_buf); + uint64_t bucket_size = 0; + { + uint64_t prev = constants::invalid_uint64; + for (auto const& mt : bucket_buf) { + if (mt.pos_in_seq != prev) { + ++bucket_size; + prev = mt.pos_in_seq; + } + } + } + buckets_stats.add_bucket_size(bucket_size); + if (bucket_size > 1) { + if (bucket_size <= min_size) { + ++num_buckets_larger_than_1_not_in_skew_index; + num_minimizer_positions_of_buckets_larger_than_1 += bucket_size; + } else { + ++num_buckets_in_skew_index; + num_minimizer_positions_of_buckets_in_skew_index += bucket_size; + } + } + for (auto const& mt : bucket_buf) { + buckets_stats.add_num_kmers_in_super_kmer(bucket_size, mt.num_kmers_in_super_kmer); } - num_super_kmers_in_buckets_larger_than_1 += bucket.num_super_kmers(); - } - - for (auto mt : bucket) { - buckets_stats.add_num_kmers_in_super_kmer(bucket_size, mt.num_kmers_in_super_kmer); } + reader.close(); } assert(buckets_stats.num_buckets() == num_minimizers); @@ -162,20 +221,17 @@ void dictionary_builder::build_sparse_and_skew_index( std::cout << "num_bits_for_control = " << num_bits_for_control << std::endl; } - bits::compact_vector::builder control_codewords_builder; - control_codewords_builder.resize(num_minimizers, num_bits_for_control); - - strings_offsets_builder.build(d.m_spss.strings_offsets); - /* 
`d.m_spss.strings` is materialized later, in step 8, from the on-disk - strings tmp file owned by `strings_builder`. Step 7.2 phase (B) reads - directly from the file via a `disk_backed_strings::reader` window. */ - - /* step 1. build sparse index */ - assert(buckets_stats.num_buckets() == num_minimizers); - const uint64_t max_bucket_size = buckets_stats.max_bucket_size(); const uint64_t log2_max_bucket_size = std::ceil(std::log2(max_bucket_size)); + uint64_t num_partitions = constants::max_l - constants::min_l + 1; + if (max_bucket_size < min_size) { + num_partitions = 0; + } else if (max_bucket_size < (1ULL << constants::max_l)) { + num_partitions = log2_max_bucket_size - constants::min_l; + } + assert(num_partitions <= 8); // so that we need 3 bits to encode a partition_id + if (build_config.verbose) { std::cout << "num_buckets_larger_than_1_not_in_skew_index " << num_buckets_larger_than_1_not_in_skew_index << "/" @@ -189,55 +245,6 @@ void dictionary_builder::build_sparse_and_skew_index( << std::endl; std::cout << "max_bucket_size " << max_bucket_size << std::endl; std::cout << "log2_max_bucket_size " << log2_max_bucket_size << std::endl; - } - - std::vector buckets; - buckets.reserve(num_buckets_larger_than_1_not_in_skew_index + num_buckets_in_skew_index); - - /* Second pass: register buckets > 1 (pointing directly into the mmap'd - `input`, no copy) and handle size-1 buckets inline. */ - for (minimizers_tuples_iterator it(input.data(), input.data() + input.size()); // - it.has_next(); it.next()) // - { - const uint64_t bucket_id = it.minimizer(); - auto bucket = it.bucket(); - const uint64_t bucket_size = bucket.size(); - if (bucket_size == 1) { - // Handle size-1 buckets: encode directly into control codewords - uint64_t prev_pos_in_seq = constants::invalid_uint64; - for (auto mt : bucket) { - if (mt.pos_in_seq != prev_pos_in_seq) { - /* - For minimizers occurring once, store a (log(N)+1)-bit - code, as follows: |offset|0|, i.e., the LSB is 0. 
- */ - uint64_t code = mt.pos_in_seq << 1; // first LS bit encodes status code: 0 - assert(code < (uint64_t(1) << num_bits_for_control)); - control_codewords_builder.set(bucket_id, code); - prev_pos_in_seq = mt.pos_in_seq; - } - } - } else { - /* Buckets > 1: store pointers directly into the mmap'd `input`. - `input` is kept open through step 7.2 phase (A). */ - buckets.push_back(bucket_type(bucket.begin_ptr(), bucket.end_ptr())); - } - } - assert(buckets.size() == - num_buckets_larger_than_1_not_in_skew_index + num_buckets_in_skew_index); - - std::sort(buckets.begin(), buckets.end(), - [](bucket_type const& x, bucket_type const& y) { return x.size() < y.size(); }); - - uint64_t num_partitions = constants::max_l - constants::min_l + 1; - if (max_bucket_size < min_size) { - num_partitions = 0; - } else if (max_bucket_size < (1ULL << constants::max_l)) { - num_partitions = log2_max_bucket_size - constants::min_l; - } - assert(num_partitions <= 8); // so that we need 3 bits to encode a partition_id - - if (build_config.verbose) { std::cout << "num_partitions in skew index " << num_partitions << std::endl; std::cout << "num_minimizer_positions_of_buckets_larger_than_1 " << num_minimizer_positions_of_buckets_larger_than_1 << "/" @@ -253,95 +260,193 @@ void dictionary_builder::build_sparse_and_skew_index( << "%)" << std::endl; } - { - bits::compact_vector::builder mid_load_buckets_builder; - bits::compact_vector::builder heavy_load_buckets_builder; - mid_load_buckets_builder.resize(num_minimizer_positions_of_buckets_larger_than_1, - num_bits_per_offset); - heavy_load_buckets_builder.resize(num_minimizer_positions_of_buckets_in_skew_index, - num_bits_per_offset); - - std::vector begin_buckets_of_size; - begin_buckets_of_size.resize(min_size + 1, 0); - - uint64_t curr_bucket_size = 2; - uint64_t list_id = 0; - uint64_t mid_load_buckets_size = 0; - uint64_t heavy_load_buckets_size = 0; - - uint64_t partition_id = 0; - uint64_t lower = min_size; - uint64_t upper = 2 * lower; 
+ /* Materialize strings_offsets now: needed below to decode pos_in_seq + into absolute offsets when emitting heavy-bucket kmer requests. + `d.m_spss.strings` is materialized later in step 8 (or stream-saved + directly to disk). */ + strings_offsets_builder.build(d.m_spss.strings_offsets); - for (auto bucket : buckets) { - const uint64_t bucket_size = bucket.size(); - assert(bucket_size >= 2); + /* Precompute the layout of mid_load_buckets from the bucket-size + histogram. begin_buckets_of_size[s] is the start offset (in + positions, not bits) of size-s bucket positions in mid_load_buckets; + it lets us write each bucket's positions in place during the + single-pass build, without needing to sort buckets by size. */ + std::vector begin_buckets_of_size(min_size + 1, 0); + for (uint64_t s = 3; s <= min_size; ++s) { + begin_buckets_of_size[s] = static_cast( // + begin_buckets_of_size[s - 1] + + buckets_stats.num_buckets_of_size(s - 1) * (s - 1)); + } - if (bucket_size > curr_bucket_size) { - while (bucket_size > curr_bucket_size) ++curr_bucket_size; - if (curr_bucket_size <= min_size) { - begin_buckets_of_size[curr_bucket_size] = mid_load_buckets_size; - } else { - while (curr_bucket_size > upper) { - lower = upper; - upper = 2 * lower; - partition_id += 1; - if (partition_id == num_partitions - 1) upper = max_bucket_size; + bits::compact_vector::builder control_codewords_builder; + bits::compact_vector::builder mid_load_buckets_builder; + bits::compact_vector::builder heavy_load_buckets_builder; + control_codewords_builder.resize(num_minimizers, num_bits_for_control); + mid_load_buckets_builder.resize(num_minimizer_positions_of_buckets_larger_than_1, + num_bits_per_offset); + heavy_load_buckets_builder.resize(num_minimizer_positions_of_buckets_in_skew_index, + num_bits_per_offset); + + /* Per-size cursor for mid_load (initialized to begin_buckets_of_size) + and per-size list_id counter; monotone cursor for heavy_load. 
*/ + std::vector mid_load_cursor(min_size + 1, 0); + for (uint64_t s = 2; s <= min_size; ++s) mid_load_cursor[s] = begin_buckets_of_size[s]; + std::vector list_id_per_size(min_size + 1, 0); + uint64_t heavy_load_cursor = 0; + + /* Per-partition kmer counts; filled during the heavy branch of the + combined pass below. */ + std::vector num_kmers_in_partition(num_partitions, 0); + + /* Skew-index tmp file naming. */ + const uint64_t skew_run_id = pthash::clock_type::now().time_since_epoch().count(); + auto request_run_filename = [&](uint64_t id) { + std::stringstream ss; + ss << build_config.tmp_dirname << "/sshash.tmp.run_" << skew_run_id + << ".kmer_requests." << id << ".bin"; + return ss.str(); + }; + auto skew_partition_filename = [&](uint64_t pid) { + std::stringstream ss; + ss << build_config.tmp_dirname << "/sshash.tmp.run_" << skew_run_id + << ".skew_kmers." << pid << ".bin"; + return ss.str(); + }; + + /* External-sort buffer for kmer-extraction requests (formerly step 7.2 + phase A; now folded into the combined pass). */ + std::atomic num_request_runs{0}; + const uint64_t request_buffer_capacity = std::max( + uint64_t(1) << 16, + (build_config.ram_limit_in_GiB * essentials::GiB) / + (4 * sizeof(kmer_extraction_request))); + std::vector request_buffer; + request_buffer.reserve(request_buffer_capacity); + auto flush_request_buffer = [&]() { + if (request_buffer.empty()) return; + parallel_sort(request_buffer, build_config.num_threads, + [](kmer_extraction_request const& a, kmer_extraction_request const& b) { + return a.starting_pos < b.starting_pos; + }); + const uint64_t id = num_request_runs.fetch_add(1); + const std::string fn = request_run_filename(id); + if (build_config.verbose) { + std::cout << "saving to file '" << fn << "'..." 
<< std::endl; + } + std::ofstream out(fn, std::ofstream::binary); + if (!out.is_open()) throw std::runtime_error("cannot open file"); + out.write(reinterpret_cast(request_buffer.data()), + request_buffer.size() * sizeof(kmer_extraction_request)); + out.close(); + request_buffer.clear(); + }; + + /* Map bucket size → partition_id for heavy buckets. num_partitions <= 8 + so this loop is constant time. */ + auto partition_for_size = [&](uint64_t bucket_size) -> uint64_t { + assert(bucket_size > min_size); + uint64_t pid = 0; + uint64_t upper = 2 * min_size; + while (bucket_size > upper && pid + 1 < num_partitions) { + upper *= 2; + ++pid; + } + return pid; + }; + + /* + Combined pass: stream the merged minimizers file once and, per + bucket, write the appropriate part of the sparse index. For heavy + buckets we also emit kmer-extraction requests in-line (what was + formerly step 7.2 phase A). No mmap; no in-RAM `buckets` array. + */ + { + streaming_minimizer_bucket_reader reader; + reader.open(minimizers_filename); + std::vector bucket_buf; + while (reader.has_next_bucket()) { + const uint64_t bucket_id = reader.next_bucket(bucket_buf); + uint64_t bucket_size = 0; + { + uint64_t prev = constants::invalid_uint64; + for (auto const& mt : bucket_buf) { + if (mt.pos_in_seq != prev) { + ++bucket_size; + prev = mt.pos_in_seq; } } - list_id = 0; } - if (curr_bucket_size <= min_size) { + if (bucket_size == 1) { + /* Singleton: code = |offset|0|, LSB = 0. */ + const uint64_t code = bucket_buf.front().pos_in_seq << 1; + assert(code < (uint64_t(1) << num_bits_for_control)); + control_codewords_builder.set(bucket_id, code); + } else if (bucket_size <= min_size) { + /* Mid-load: write positions at the per-size cursor and + assign the next list_id for this size. 
*/ + const uint64_t list_id = list_id_per_size[bucket_size]++; + const uint64_t code = + (((list_id << constants::min_l) | (bucket_size - 2)) << 2) | 1; + assert(code < (uint64_t(1) << num_bits_for_control)); + control_codewords_builder.set(bucket_id, code); + + uint64_t cursor = mid_load_cursor[bucket_size]; uint64_t prev_pos_in_seq = constants::invalid_uint64; - for (auto mt : bucket) { - if (prev_pos_in_seq == constants::invalid_uint64) { // only once - uint64_t p = (list_id << constants::min_l) | (curr_bucket_size - 2); - uint64_t code = (p << 2) | 1; // first two LS bits encode status code: 01 - assert(code < (uint64_t(1) << num_bits_for_control)); - control_codewords_builder.set(mt.minimizer, code); - } + for (auto const& mt : bucket_buf) { if (mt.pos_in_seq != prev_pos_in_seq) { - mid_load_buckets_builder.push_back(mt.pos_in_seq); + mid_load_buckets_builder.set(cursor++, mt.pos_in_seq); prev_pos_in_seq = mt.pos_in_seq; - mid_load_buckets_size += 1; } } - ++list_id; + mid_load_cursor[bucket_size] = cursor; } else { + /* Heavy: write positions at the monotone cursor, set the + codeword (encodes the start offset and partition id), + and emit kmer-extraction requests for each super-kmer + in the bucket. 
*/ + const uint64_t partition_id = partition_for_size(bucket_size); + assert(partition_id < num_partitions); + const uint64_t bucket_begin = heavy_load_cursor; + const uint64_t code = (((bucket_begin << 3) | partition_id) << 2) | 3; + assert(code < (uint64_t(1) << num_bits_for_control)); + control_codewords_builder.set(bucket_id, code); + + uint32_t pos_in_bucket = uint32_t(-1); uint64_t prev_pos_in_seq = constants::invalid_uint64; - for (auto mt : bucket) { - if (prev_pos_in_seq == constants::invalid_uint64) { // only once - assert(partition_id < 8); - uint64_t p = (heavy_load_buckets_size << 3) | partition_id; - uint64_t code = (p << 2) | 3; // first two LS bits encode status code: 11 - assert(code < (uint64_t(1) << num_bits_for_control)); - control_codewords_builder.set(mt.minimizer, code); - } + for (auto const& mt : bucket_buf) { + num_kmers_in_partition[partition_id] += mt.num_kmers_in_super_kmer; if (mt.pos_in_seq != prev_pos_in_seq) { - heavy_load_buckets_builder.push_back(mt.pos_in_seq); + heavy_load_buckets_builder.set(heavy_load_cursor++, mt.pos_in_seq); prev_pos_in_seq = mt.pos_in_seq; - heavy_load_buckets_size += 1; + ++pos_in_bucket; } + assert(mt.pos_in_seq >= mt.pos_in_kmer); + const uint64_t abs_offset = + d.m_spss.strings_offsets.decode(mt.pos_in_seq).absolute_offset; + const uint64_t starting_pos = abs_offset - mt.pos_in_kmer; + if (request_buffer.size() == request_buffer_capacity) flush_request_buffer(); + request_buffer.emplace_back(starting_pos, uint32_t(partition_id), + pos_in_bucket, + uint32_t(mt.num_kmers_in_super_kmer)); } } } - - d.m_ssi.begin_buckets_of_size = std::move(begin_buckets_of_size); - - control_codewords_builder.build(d.m_ssi.codewords.control_codewords); - mid_load_buckets_builder.build(d.m_ssi.mid_load_buckets); - heavy_load_buckets_builder.build(d.m_ssi.ski.heavy_load_buckets); + reader.close(); + flush_request_buffer(); } - timer.stop(); + /* Build sparse-index structures into the dictionary. 
*/ + d.m_ssi.begin_buckets_of_size = std::move(begin_buckets_of_size); + control_codewords_builder.build(d.m_ssi.codewords.control_codewords); + mid_load_buckets_builder.build(d.m_ssi.mid_load_buckets); + heavy_load_buckets_builder.build(d.m_ssi.ski.heavy_load_buckets); + timer.stop(); build_stats.add("step 7.1 (build sparse index)", uint64_t(timer.elapsed())); - if (build_config.verbose) { print_time(uint64_t(timer.elapsed()), "step 7.1 (build sparse index)"); } - timer.reset(); if (num_buckets_in_skew_index == 0) { @@ -352,119 +457,14 @@ void dictionary_builder::build_sparse_and_skew_index( /* step 2. build skew index - We do this in three sub-steps: - (A) walk the heavy-load buckets in size-sorted order, decode each - super-kmer's absolute starting position in `strings` and emit a - `kmer_extraction_request`. Requests are sort+flushed to disk in - chunks (external sort by `starting_pos`). - (B) merge the sorted runs and walk `strings` in a single forward - sequential pass, extracting the requested k-mers. For each k-mer - we append `(kmer.bits, pos_in_bucket)` to a per-partition tmp file. - (C) for each partition, read its tmp file, build the MPHF, then build - the positions compact vector. The skew index is assembled - partition by partition. - - Avoiding the random access pattern over `strings` in (B) is the - precondition for moving `strings` itself out of RAM in a later step. + Phases (B) and (C) below; phase (A) was folded into the combined + sparse pass above. Phase (B) extracts k-mers from `strings` in a + single forward sweep guided by the externally-sorted requests, and + phase (C) builds the per-partition MPHF + positions in external + memory from the per-partition kmer files. 
*/ timer.start(); - std::vector num_kmers_in_partition(num_partitions, 0); - - /* unique run identifier for the tmp files produced by this step */ - const uint64_t skew_run_id = pthash::clock_type::now().time_since_epoch().count(); - auto request_run_filename = [&](uint64_t id) { - std::stringstream ss; - ss << build_config.tmp_dirname << "/sshash.tmp.run_" << skew_run_id - << ".kmer_requests." << id << ".bin"; - return ss.str(); - }; - auto skew_partition_filename = [&](uint64_t pid) { - std::stringstream ss; - ss << build_config.tmp_dirname << "/sshash.tmp.run_" << skew_run_id - << ".skew_kmers." << pid << ".bin"; - return ss.str(); - }; - - /* (A) emit kmer-extraction requests, externally sorted by `starting_pos` */ - std::atomic num_request_runs{0}; - { - const uint64_t request_buffer_capacity = std::max( - uint64_t(1) << 16, - (build_config.ram_limit_in_GiB * essentials::GiB) / - (4 * sizeof(kmer_extraction_request))); - - std::vector request_buffer; - request_buffer.reserve(request_buffer_capacity); - - auto flush_request_buffer = [&]() { - if (request_buffer.empty()) return; - parallel_sort(request_buffer, build_config.num_threads, - [](kmer_extraction_request const& a, - kmer_extraction_request const& b) { - return a.starting_pos < b.starting_pos; - }); - const uint64_t id = num_request_runs.fetch_add(1); - const std::string fn = request_run_filename(id); - if (build_config.verbose) { - std::cout << "saving to file '" << fn << "'..." 
<< std::endl; - } - std::ofstream out(fn, std::ofstream::binary); - if (!out.is_open()) throw std::runtime_error("cannot open file"); - out.write(reinterpret_cast(request_buffer.data()), - request_buffer.size() * sizeof(kmer_extraction_request)); - out.close(); - request_buffer.clear(); - }; - - uint64_t partition_id = 0; - uint64_t lower = min_size; - uint64_t upper = 2 * lower; - - for (uint64_t i = buckets.size() - num_buckets_in_skew_index; i < buckets.size(); ++i) // - { - auto const& bucket = buckets[i]; - const uint64_t bucket_size = bucket.size(); - while (bucket_size > upper) // - { - lower = upper; - upper = 2 * lower; - partition_id += 1; - if (partition_id == num_partitions - 1) upper = max_bucket_size; - } - assert(bucket_size > lower and bucket_size <= upper); - assert(partition_id < num_partitions); - - uint32_t pos_in_bucket = uint32_t(-1); - uint64_t prev_pos_in_seq = constants::invalid_uint64; - for (auto mt : bucket) // - { - num_kmers_in_partition[partition_id] += mt.num_kmers_in_super_kmer; - if (mt.pos_in_seq != prev_pos_in_seq) { - prev_pos_in_seq = mt.pos_in_seq; - ++pos_in_bucket; - } - assert(mt.pos_in_seq >= mt.pos_in_kmer); - const uint64_t abs_offset = - d.m_spss.strings_offsets.decode(mt.pos_in_seq).absolute_offset; - const uint64_t starting_pos = abs_offset - mt.pos_in_kmer; - if (request_buffer.size() == request_buffer_capacity) flush_request_buffer(); - request_buffer.emplace_back(starting_pos, // - uint32_t(partition_id), // - pos_in_bucket, // - uint32_t(mt.num_kmers_in_super_kmer)); // - } - } - flush_request_buffer(); - assert(partition_id == num_partitions - 1); - } - - /* `buckets` and the mmap'd `input` are no longer needed: phase (B) walks - the sorted requests and per-partition tmp files, phase (C) walks the - per-partition tmp files. Free both now to bound RAM. 
*/ - std::vector().swap(buckets); - input.close(); - if (build_config.verbose) { uint64_t total_kmers_in_skew = 0; for (uint64_t p = 0; p != num_partitions; ++p) { From a6ac614fd5288ecf276aecc30a55e03a3119a550 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 5 May 2026 06:29:36 +0000 Subject: [PATCH 09/32] remove all remaining mmap from the SSHash build path This commit eliminates the last three mm::file_source usages in the build, so the only mmap left in step 7 is inside pthash's external-memory PHF builder (which manages its own RAM via config.ram). (1) file_merging_iterator: each input run is now read with a bounded buffered std::ifstream (default 4096 records per stream) instead of mm::file_source. The winner-tree merge logic is unchanged; comparisons just use the in-RAM buffer's current value rather than a pointer into mmap'd memory. RSS for the merge is now bounded by `num_runs * buffer_records * sizeof(T)` regardless of run sizes. (2) minimizers_tuples::merge: the post-rename single-file count path used to mmap the merged file via mm::file_source and walk it with minimizers_tuples_iterator. It now uses streaming_minimizer_bucket_reader (hoisted from build_sparse_and_skew_index.cpp into util.hpp) for a pure ifstream pass. (3) dictionary_builder::build_mphf: replaces mm::file_source + minimizers_tuples_iterator with a new streaming_minimizers_iterator that yields each distinct minimizer once via std::ifstream. The iterator is copyable (shared_ptr), as required by pthash's by-value `build_in_external_memory` signature. Verified byte-identical output vs the previous commit on salmonella_enterica m=7, plus full --check on regular, --canonical, multi-thread (-t 4), and --weighted builds, plus a streaming-save round-trip (sshash check + sshash query) returning "EVERYTHING OK!" on all five suites and 100% positive matches. 
https://claude.ai/code/session_01BShS2GDASvEsCAbgJyQVBK --- include/builder/dictionary_builder.hpp | 10 +- include/builder/file_merging_iterator.hpp | 183 +++++++++++++------- include/builder/util.hpp | 153 ++++++++++++++-- src/builder/build_sparse_and_skew_index.cpp | 49 ------ 4 files changed, 272 insertions(+), 123 deletions(-) diff --git a/include/builder/dictionary_builder.hpp b/include/builder/dictionary_builder.hpp index 3c4c5fe..29ad04e 100644 --- a/include/builder/dictionary_builder.hpp +++ b/include/builder/dictionary_builder.hpp @@ -169,11 +169,13 @@ struct dictionary_builder // void build_mphf(dictionary& d) { const uint64_t num_minimizers = minimizers.num_minimizers(); - mm::file_source input(minimizers.get_minimizers_filename(), - mm::advice::sequential); - minimizers_tuples_iterator iterator(input.data(), input.data() + input.size()); + /* Stream minimizers from disk via std::ifstream (no mmap); the + iterator yields each distinct minimizer once, matching what + `minimizers_tuples_iterator` did over the mmap'd file. */ + streaming_minimizers_iterator iterator; + iterator.open(minimizers.get_minimizers_filename()); d.m_ssi.codewords.build(iterator, num_minimizers, build_config); - input.close(); + iterator.close(); assert(d.m_ssi.codewords.size() == num_minimizers); } diff --git a/include/builder/file_merging_iterator.hpp b/include/builder/file_merging_iterator.hpp index 85b95ac..ff191ee 100644 --- a/include/builder/file_merging_iterator.hpp +++ b/include/builder/file_merging_iterator.hpp @@ -1,42 +1,59 @@ #pragma once -#include -#include -#include #include +#include +#include +#include +#include +#include #include "util.hpp" namespace sshash { /* - Winner-tree-based implementation. + Winner-tree-based external-merge iterator over N sorted runs on disk. + + Each run is read with a small buffered std::ifstream (no mmap) so that + process RSS stays bounded by `num_files_to_merge * buffer_records * + sizeof(T)` regardless of total run size. 
Values are surfaced as + `T const&` from each stream's in-RAM buffer; the merge logic compares + those values directly instead of pointers into mmap'd memory. + + Required of T: + - copy-constructible / move-constructible, + - `static T T::max()` returning a strict upper bound (used as the + sentinel for exhausted streams in the winner tree), + - `bool operator<(T, T)`. */ template struct file_merging_iterator // { + static constexpr uint64_t default_buffer_records = uint64_t(1) << 12; // 4096 records const uint64_t scan_threshold = 16; template - file_merging_iterator(FileNamesIterator file_names_iterator, uint64_t num_files_to_merge) - : m_mm_files(num_files_to_merge) // + file_merging_iterator(FileNamesIterator file_names_iterator, uint64_t num_files_to_merge, + uint64_t buffer_records = default_buffer_records) // { - if (num_files_to_merge == 0) return; + if (num_files_to_merge == 0) { + m_num_files_to_merge = 0; + return; + } - /* open files and create the input iterators */ - m_iterators.reserve(num_files_to_merge); + m_streams.reserve(num_files_to_merge); for (uint64_t i = 0; i != num_files_to_merge; ++i, ++file_names_iterator) { - m_mm_files[i].open(*file_names_iterator, mm::advice::sequential); - m_iterators.push_back( - {m_mm_files[i].data(), m_mm_files[i].data() + m_mm_files[i].size()}); + m_streams.emplace_back(); + m_streams.back().open(*file_names_iterator, buffer_records); } m_num_files_to_merge = num_files_to_merge; m_min_idx = 0; - if (m_iterators.size() <= scan_threshold) { + if (m_streams.size() <= scan_threshold) { compute_min(); } else { - /* build a winner tree */ + /* build a winner tree (same shape as before, but the leaves + index into m_streams instead of carrying raw pointers). 
*/ uint64_t n = num_files_to_merge; uint64_t m = 2 * n - 1; m_size = n; @@ -51,97 +68,143 @@ struct file_merging_iterator // bool has_next() { return m_num_files_to_merge != 0; } void next() { update(); } - T operator*() const { return *(m_iterators[m_min_idx].begin); } + T operator*() const { return m_streams[m_min_idx].current(); } void close() { - for (auto& mm_file : m_mm_files) mm_file.close(); - m_iterators.clear(); - m_mm_files.clear(); + for (auto& s : m_streams) s.close(); + m_streams.clear(); + m_streams.shrink_to_fit(); m_tree.clear(); + m_tree.shrink_to_fit(); } private: - struct pointer_pair { - T const* begin; - T const* end; + /* + A buffered, forward-only reader over a single run file. Reads in + chunks of `m_buf.size()` records via std::ifstream and presents a + T-by-reference current-value interface. + */ + struct buffered_stream { + buffered_stream() = default; + buffered_stream(buffered_stream const&) = delete; + buffered_stream& operator=(buffered_stream const&) = delete; + buffered_stream(buffered_stream&&) = default; + buffered_stream& operator=(buffered_stream&&) = default; + + void open(std::string const& filename, uint64_t buffer_records) { + m_buf.resize(std::max(1, buffer_records)); + m_in.open(filename, std::ifstream::binary); + if (!m_in.is_open()) { + throw std::runtime_error("cannot open run file '" + filename + "'"); + } + m_pos = 0; + m_size = 0; + m_eof = false; + refill(); + } + + void close() { + if (m_in.is_open()) m_in.close(); + m_buf.clear(); + m_buf.shrink_to_fit(); + m_pos = 0; + m_size = 0; + m_eof = true; + } + + bool empty() const { return m_pos >= m_size; } + + T const& current() const { + assert(!empty()); + return m_buf[m_pos]; + } + + void advance() { + assert(!empty()); + ++m_pos; + if (m_pos >= m_size && !m_eof) refill(); + } + + private: + std::ifstream m_in; + std::vector m_buf; + uint64_t m_pos = 0; + uint64_t m_size = 0; + bool m_eof = true; + + void refill() { + m_pos = 0; + 
m_in.read(reinterpret_cast(m_buf.data()), + static_cast(m_buf.size() * sizeof(T))); + const std::streamsize got = m_in.gcount(); + m_size = static_cast(got) / sizeof(T); + if (m_size == 0) m_eof = true; + } }; - std::vector m_iterators; - std::vector> m_mm_files; + + std::vector m_streams; std::vector m_tree; - uint64_t m_begin, m_size; - uint64_t m_min_idx, m_num_files_to_merge; + uint64_t m_begin = 0, m_size = 0; + uint64_t m_min_idx = 0, m_num_files_to_merge = 0; void update() { - if (m_iterators.size() <= scan_threshold) { // compute min with a linear scan - auto& it = m_iterators[m_min_idx]; - it.begin += 1; - if (it.begin == it.end) { - m_iterators.erase(m_iterators.begin() + m_min_idx); + if (m_streams.size() <= scan_threshold) { + auto& s = m_streams[m_min_idx]; + s.advance(); + if (s.empty()) { + m_streams.erase(m_streams.begin() + m_min_idx); m_min_idx = 0; --m_num_files_to_merge; if (m_num_files_to_merge == 0) return; } compute_min(); - } else { // update the winner tree + } else { // winner tree m_min_idx = m_tree[0]; - assert(m_min_idx < m_iterators.size()); - auto& it = m_iterators[m_min_idx]; - it.begin += 1; + assert(m_min_idx < m_streams.size()); + auto& s = m_streams[m_min_idx]; + s.advance(); uint64_t p = m_begin + m_min_idx; - p -= (p >= m_tree.size()) * m_size; // p is the index of the leaf - if (it.begin == it.end) { + p -= (p >= m_tree.size()) * m_size; // p is the leaf index + if (s.empty()) { m_tree[p] = uint32_t(-1); --m_num_files_to_merge; } const T inf = T::max(); while (p) { uint64_t is_r_child = (p & 1) == 0; - uint32_t i = 0; uint32_t l = m_tree[p - is_r_child]; uint32_t r = m_tree[p + 1 - is_r_child]; - - T const* ptr_l = (l == uint32_t(-1)) ? &inf : m_iterators[l].begin; - T const* ptr_r = (r == uint32_t(-1)) ? &inf : m_iterators[r].begin; - i = (*ptr_l < *ptr_r) ? 
l : r; - - /* same as this code but the one above uses CMOV */ - // if (l == uint32_t(-1)) { - // i = r; - // } else if (r == uint32_t(-1)) { - // i = l; - // } else { - // i = *(m_iterators[l].begin) < *(m_iterators[r].begin) ? l : r; - // } - + T const& vl = (l == uint32_t(-1)) ? inf : m_streams[l].current(); + T const& vr = (r == uint32_t(-1)) ? inf : m_streams[r].current(); + uint32_t i = (vl < vr) ? l : r; uint64_t parent = (p - 1) / 2; m_tree[parent] = i; p = parent; } m_min_idx = m_tree[0]; } - }; + } uint32_t build(uint32_t p) { if (p >= m_tree.size()) return uint32_t(-1); if (p >= m_size - 1) return m_tree[p]; // leaf uint32_t l = build(2 * p + 1); uint32_t r = build(2 * p + 2); - uint32_t i = 0; const T inf = T::max(); - T const* ptr_l = (l == uint32_t(-1)) ? &inf : m_iterators[l].begin; - T const* ptr_r = (r == uint32_t(-1)) ? &inf : m_iterators[r].begin; - i = (*ptr_l < *ptr_r) ? l : r; + T const& vl = (l == uint32_t(-1)) ? inf : m_streams[l].current(); + T const& vr = (r == uint32_t(-1)) ? inf : m_streams[r].current(); + uint32_t i = (vl < vr) ? 
l : r; m_tree[p] = i; return i; } void compute_min() { m_min_idx = 0; - auto min_val = *m_iterators.front().begin; - for (uint64_t i = 1; i != m_iterators.size(); ++i) { - assert(m_iterators[i].begin != m_iterators[i].end); - auto val = *m_iterators[i].begin; + T min_val = m_streams.front().current(); + for (uint64_t i = 1; i != m_streams.size(); ++i) { + assert(!m_streams[i].empty()); + T const& val = m_streams[i].current(); if (val < min_val) { min_val = val; m_min_idx = i; diff --git a/include/builder/util.hpp b/include/builder/util.hpp index 9b9b209..bd57038 100644 --- a/include/builder/util.hpp +++ b/include/builder/util.hpp @@ -1,7 +1,9 @@ #pragma once -#include #include +#include +#include +#include #include "file_merging_iterator.hpp" #include "parallel_sort.hpp" @@ -153,6 +155,126 @@ struct minimizers_tuples_iterator { } }; +/* + Streaming forward iterator over a sorted minimizers tmp file that + yields each distinct `minimizer` value exactly once (i.e., one value + per bucket). Equivalent to `minimizers_tuples_iterator` over an mmap'd + buffer, but reads from std::ifstream so RAM usage is constant. + + Copyable: pthash's `build_in_external_memory` takes the iterator by + value, so the underlying ifstream is held via shared_ptr. Copies share + the stream state; pthash's local copy advances the shared stream, and + the original at the call site is unused after the build returns. 
+*/ +struct streaming_minimizers_iterator { + using iterator_category = std::forward_iterator_tag; + using value_type = uint64_t; + using difference_type = std::ptrdiff_t; + using reference = uint64_t const&; + using pointer = uint64_t const*; + + streaming_minimizers_iterator() = default; + + void open(std::string const& filename) { + m_in = std::make_shared(filename, std::ifstream::binary); + if (!m_in->is_open()) { + throw std::runtime_error("cannot open minimizers tmp file '" + filename + "'"); + } + m_eof = false; + m_current = uint64_t(-1); + // Bootstrap: read the first tuple. + minimizer_tuple t; + m_in->read(reinterpret_cast(&t), sizeof(minimizer_tuple)); + if (m_in->gcount() != static_cast(sizeof(minimizer_tuple))) { + m_eof = true; + return; + } + m_current = t.minimizer; + } + + void close() { + if (m_in && m_in->is_open()) m_in->close(); + m_in.reset(); + } + + uint64_t operator*() const { return m_current; } + streaming_minimizers_iterator& operator++() { + advance_to_next_minimizer(); + return *this; + } + +private: + std::shared_ptr m_in; + uint64_t m_current = uint64_t(-1); + bool m_eof = true; + + void advance_to_next_minimizer() { + const uint64_t prev = m_current; + minimizer_tuple t; + while (true) { + m_in->read(reinterpret_cast(&t), sizeof(minimizer_tuple)); + if (m_in->gcount() != static_cast(sizeof(minimizer_tuple))) { + m_eof = true; + return; // m_current holds last value; pthash has consumed `num_minimizers` keys + } + if (t.minimizer != prev) { + m_current = t.minimizer; + return; + } + } + } +}; + +/* + Streaming reader over a minimizers tmp file. Reads minimizer_tuple + records via std::ifstream (no mmap), and groups consecutive tuples by + minimizer into "buckets" — exactly as `minimizers_tuples_iterator` does + over an mmap'd buffer, but with bounded RAM (~ one bucket at a time + plus one record of lookahead). 
+ + The caller passes a vector to receive the bucket's tuples; for typical + inputs this peaks at max_bucket_size * sizeof(minimizer_tuple). +*/ +struct streaming_minimizer_bucket_reader { + void open(std::string const& filename) { + m_in.open(filename, std::ifstream::binary); + if (!m_in.is_open()) { + throw std::runtime_error("cannot open minimizers tmp file '" + filename + "'"); + } + // Read first record into the lookahead slot, if any. + m_in.read(reinterpret_cast(&m_lookahead), sizeof(minimizer_tuple)); + m_eof = (m_in.gcount() != static_cast(sizeof(minimizer_tuple))); + } + + void close() { + if (m_in.is_open()) m_in.close(); + } + + bool has_next_bucket() const { return !m_eof; } + + /* Read the next bucket into `bucket_out` (cleared first). All tuples in + a bucket share the same minimizer. Returns the bucket's minimizer. */ + uint64_t next_bucket(std::vector& bucket_out) { + bucket_out.clear(); + assert(!m_eof); + const uint64_t mm = m_lookahead.minimizer; + do { + bucket_out.push_back(m_lookahead); + m_in.read(reinterpret_cast(&m_lookahead), sizeof(minimizer_tuple)); + if (m_in.gcount() != static_cast(sizeof(minimizer_tuple))) { + m_eof = true; + break; + } + } while (m_lookahead.minimizer == mm); + return mm; + } + +private: + std::ifstream m_in; + minimizer_tuple m_lookahead; + bool m_eof = true; +}; + struct minimizers_tuples { minimizers_tuples() {} minimizers_tuples(build_configuration const& build_config) @@ -217,17 +339,28 @@ struct minimizers_tuples { assert(m_num_minimizers == 0); assert(m_num_minimizer_positions == 0); assert(m_num_super_kmers == 0); - mm::file_source input(get_minimizers_filename(), - mm::advice::sequential); - for (minimizers_tuples_iterator it(input.data(), input.data() + input.size()); - it.has_next(); it.next()) // - { - auto bucket = it.bucket(); + + /* Single-pass count via streaming ifstream (no mmap). 
*/ + streaming_minimizer_bucket_reader reader; + reader.open(get_minimizers_filename()); + std::vector bucket_buf; + while (reader.has_next_bucket()) { + reader.next_bucket(bucket_buf); + uint64_t bucket_size = 0; + { + uint64_t prev = constants::invalid_uint64; + for (auto const& mt : bucket_buf) { + if (mt.pos_in_seq != prev) { + ++bucket_size; + prev = mt.pos_in_seq; + } + } + } m_num_minimizers += 1; - m_num_minimizer_positions += bucket.size(); - m_num_super_kmers += bucket.num_super_kmers(); + m_num_minimizer_positions += bucket_size; + m_num_super_kmers += bucket_buf.size(); } - input.close(); + reader.close(); return; } diff --git a/src/builder/build_sparse_and_skew_index.cpp b/src/builder/build_sparse_and_skew_index.cpp index a45df4e..de7c963 100644 --- a/src/builder/build_sparse_and_skew_index.cpp +++ b/src/builder/build_sparse_and_skew_index.cpp @@ -36,55 +36,6 @@ struct kmer_extraction_request { }; #pragma pack(pop) -/* - Streaming reader over the merged minimizers file. Reads minimizer_tuple - records via std::ifstream (no mmap), and groups consecutive tuples by - minimizer into "buckets" — exactly as `minimizers_tuples_iterator` does - over an mmap'd buffer, but with bounded RAM (~ one bucket at a time). - - The caller passes a vector to receive the bucket's tuples; for typical - inputs this peaks at max_bucket_size * sizeof(minimizer_tuple). -*/ -struct streaming_minimizer_bucket_reader { - void open(std::string const& filename) { - m_in.open(filename, std::ifstream::binary); - if (!m_in.is_open()) { - throw std::runtime_error("cannot open minimizers tmp file '" + filename + "'"); - } - // Read first record into the lookahead slot, if any. - m_in.read(reinterpret_cast(&m_lookahead), sizeof(minimizer_tuple)); - m_eof = (m_in.gcount() != static_cast(sizeof(minimizer_tuple))); - } - - void close() { - if (m_in.is_open()) m_in.close(); - } - - bool has_next_bucket() const { return !m_eof; } - - /* Read the next bucket into `bucket_out` (cleared first). 
All tuples in - a bucket share the same minimizer. Returns the bucket's minimizer. */ - uint64_t next_bucket(std::vector& bucket_out) { - bucket_out.clear(); - assert(!m_eof); - const uint64_t mm = m_lookahead.minimizer; - do { - bucket_out.push_back(m_lookahead); - m_in.read(reinterpret_cast(&m_lookahead), sizeof(minimizer_tuple)); - if (m_in.gcount() != static_cast(sizeof(minimizer_tuple))) { - m_eof = true; - break; - } - } while (m_lookahead.minimizer == mm); - return mm; - } - -private: - std::ifstream m_in; - minimizer_tuple m_lookahead; - bool m_eof = true; -}; - /* Forward iterator over a per-skew-partition tmp file produced by step 7.2 phase (B). Each record is `(kmer.bits, uint32_t pos_in_bucket)`. From fe44326848e66b9998721ca5518ba7c1fd69809d Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 5 May 2026 06:43:03 +0000 Subject: [PATCH 10/32] strings_offsets: stream to disk during build The strings_offsets_builder used to be an in-RAM std::vector of size num_strings + 1, finalized into d.m_spss.strings_offsets at step 7.1. For inputs with many strings this is non-trivial RSS (8 B per string), held through steps 1-7. This commit replaces it with a disk-backed disk_backed_offsets_builder: - push_back() during step 1 spills to a tmp file under a small in-RAM write buffer (~32 KiB), - compute_minimizer_tuples (step 2) opens one `reader` per thread, positioned at the thread's index_begin, and walks forward sequentially via a bounded read buffer (~32 KiB / thread). The per-thread `[i]`, `[i+1]` access pattern is replaced with a single rolling `prev_offset = next()` cursor. - encode(offset, begin, string_id) is now a pure const method on the disk-backed builder (depends only on m_nb and m_size), preserving multi-threaded safety. - build(target) at step 7.1 streams the file's contents via a copyable forward iterator into target.m_seq's encode/build, so neither side materializes the offsets in RAM. The on-disk file is removed after build. 
To keep `target.m_seq` accessible from the external builder without exposing it broadly, `offsets` befriends `disk_backed_offsets_builder<...>` via a templated friend declaration; the concrete decoded_offsets / encoded_offsets types inherit that friendship. Verified byte-identical output vs the previous commit on salmonella_enterica m=7, plus full --check on regular, --canonical, multi-thread (-t 4), and --weighted, and a streaming-save round-trip (sshash check + sshash query) with all five "EVERYTHING OK!" suites and 100% positive matches. https://claude.ai/code/session_01BShS2GDASvEsCAbgJyQVBK --- include/builder/dictionary_builder.hpp | 21 +- .../builder/disk_backed_offsets_builder.hpp | 327 ++++++++++++++++++ include/offsets.hpp | 7 + src/builder/compute_minimizer_tuples.cpp | 9 +- 4 files changed, 356 insertions(+), 8 deletions(-) create mode 100644 include/builder/disk_backed_offsets_builder.hpp diff --git a/include/builder/dictionary_builder.hpp b/include/builder/dictionary_builder.hpp index 29ad04e..6d30fb5 100644 --- a/include/builder/dictionary_builder.hpp +++ b/include/builder/dictionary_builder.hpp @@ -5,6 +5,7 @@ #include "include/offsets.hpp" #include "include/builder/util.hpp" #include "include/builder/disk_backed_strings.hpp" +#include "include/builder/disk_backed_offsets_builder.hpp" #include "include/builder/streaming_save.hpp" #include "include/buckets_statistics.hpp" @@ -20,7 +21,10 @@ struct dictionary_builder // , strings_run_id(pthash::clock_type::now().time_since_epoch().count()) , total_time_musec(0) {} - ~dictionary_builder() { strings_builder.remove_file(); } + ~dictionary_builder() { + strings_builder.remove_file(); + strings_offsets_builder.remove_file(); + } /* Build a query-ready dictionary in `d`. 
After this returns, @@ -59,7 +63,7 @@ struct dictionary_builder // build_configuration build_config; uint64_t num_kmers; minimizers_tuples minimizers; - typename Offsets::builder strings_offsets_builder; + disk_backed_offsets_builder strings_offsets_builder; disk_backed_strings strings_builder; weights::builder weights_builder; @@ -88,15 +92,20 @@ struct dictionary_builder // total_time_musec = 0; { - std::stringstream ss; - ss << build_config.tmp_dirname << "/sshash.tmp.run_" << strings_run_id - << ".strings.bin"; - strings_builder.open_for_writing(ss.str()); + std::stringstream ss_strings; + ss_strings << build_config.tmp_dirname << "/sshash.tmp.run_" << strings_run_id + << ".strings.bin"; + strings_builder.open_for_writing(ss_strings.str()); + std::stringstream ss_offsets; + ss_offsets << build_config.tmp_dirname << "/sshash.tmp.run_" << strings_run_id + << ".strings_offsets.bin"; + strings_offsets_builder.open_for_writing(ss_offsets.str()); } do_step("step 1 (encode strings)", [&]() { encode_strings(filename); strings_builder.freeze(); + strings_offsets_builder.freeze(); d.m_num_kmers = num_kmers; assert(strings_offsets_builder.size() >= 2); d.m_num_strings = strings_offsets_builder.size() - 1; diff --git a/include/builder/disk_backed_offsets_builder.hpp b/include/builder/disk_backed_offsets_builder.hpp new file mode 100644 index 0000000..c86d33f --- /dev/null +++ b/include/builder/disk_backed_offsets_builder.hpp @@ -0,0 +1,327 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "include/offsets.hpp" + +namespace sshash { + +/* + A disk-backed drop-in for `Offsets::builder` that spills offset values + to a tmp file as they're appended, keeping only a small in-RAM write + buffer. RAM usage of the builder is bounded by the buffer size, + independently of the number of strings. 
+ + Interface mirrors `decoded_offsets::builder` / `encoded_offsets::builder` + enough for the SSHash build path: + + reserve(n) no-op (kept for source compat) + push_back(val) append to file (buffered) + front() / back() / size() O(1), tracked separately + num_bytes() in-RAM footprint of the builder + set_num_bits(nb) stash bit-width metadata + num_bits_per_offset() dispatched per Offsets type + encode(offset, begin, sid) pure, dispatched per Offsets type + build(target) stream from disk into the final + compact / endpoints sequence + remove_file() cleanup + + Random-access `operator[]` is *not* supported. Callers that need to + walk a contiguous range of offsets must use a `reader`, which provides + forward-sequential reads via a small in-RAM buffer. +*/ +template +struct disk_backed_offsets_builder { + static_assert(std::is_same_v || + std::is_same_v, + "disk_backed_offsets_builder supports decoded_offsets and encoded_offsets"); + + static constexpr uint64_t default_writer_buffer_records = uint64_t(1) << 12; // 32 KiB + static constexpr uint64_t default_reader_buffer_records = uint64_t(1) << 12; // 32 KiB + + disk_backed_offsets_builder() = default; + disk_backed_offsets_builder(disk_backed_offsets_builder const&) = delete; + disk_backed_offsets_builder& operator=(disk_backed_offsets_builder const&) = delete; + + void open_for_writing(std::string const& filename, + uint64_t writer_buffer_records = default_writer_buffer_records) { + m_filename = filename; + m_writer_buffer_capacity = std::max(1, writer_buffer_records); + m_buf.clear(); + m_buf.reserve(m_writer_buffer_capacity); + m_size = 0; + m_front = 0; + m_back = 0; + m_have_front = false; + m_frozen = false; + m_writer.open(m_filename, std::ofstream::binary | std::ofstream::trunc); + if (!m_writer.is_open()) { + throw std::runtime_error("cannot open offsets tmp file '" + m_filename + "'"); + } + } + + /* No-op: kept for source-compatibility with the in-RAM builder. 
*/ + void reserve(uint64_t /*n*/) {} + + void push_back(uint64_t val) { + if (!m_have_front) { + m_front = val; + m_have_front = true; + } + m_back = val; + m_buf.push_back(val); + ++m_size; + if (m_buf.size() >= m_writer_buffer_capacity) flush_buffer(); + } + + /* Finish writing: flush the in-RAM buffer and close the writer. */ + void freeze() { + if (m_frozen) return; + flush_buffer(); + if (m_writer.is_open()) m_writer.close(); + m_frozen = true; + } + + uint64_t size() const { return m_size; } + uint64_t front() const { return m_front; } + uint64_t back() const { return m_back; } + std::string const& filename() const { return m_filename; } + + /* In-RAM footprint of the builder (excluding the on-disk file). */ + uint64_t num_bytes() const { + return sizeof(m_nb) + m_buf.capacity() * sizeof(uint64_t); + } + + void set_num_bits(num_bits nb) { m_nb = nb; } + + uint64_t num_bits_per_offset() const { + if constexpr (std::is_same_v) { + return m_nb.per_absolute_offset; + } else { + return m_nb.per_string_id + m_nb.per_relative_offset; + } + } + + /* Pure: matches `decoded_offsets::builder::encode` / + `encoded_offsets::builder::encode`. Safe to call concurrently from + multiple threads (depends only on m_nb and m_size, both of which + are stable while compute_minimizer_tuples runs). */ + uint64_t encode(uint64_t offset, uint64_t begin, uint64_t string_id) const { + if constexpr (std::is_same_v) { + (void)begin; + (void)string_id; + return offset; + } else { + assert(string_id < m_size); + assert(offset >= begin); + assert((offset - begin) < (uint64_t(1) << m_nb.per_relative_offset)); + uint64_t relative_offset = offset - begin; + return (string_id << m_nb.per_relative_offset) + relative_offset; + } + } + + /* + Forward-sequential reader over the offsets file. Each thread in + compute_minimizer_tuples should construct one for its assigned + index range; per-thread RAM footprint is the buffer size only. 
+ */ + struct reader { + reader() = default; + reader(reader const&) = delete; + reader& operator=(reader const&) = delete; + reader(reader&&) = default; + reader& operator=(reader&&) = default; + + /* Open the file and seek so that the next `next()` call returns + `*(values + start_index)`. */ + void open(std::string const& filename, uint64_t start_index, + uint64_t buffer_records = default_reader_buffer_records) { + m_buf.assign(std::max(1, buffer_records), 0); + m_pos = 0; + m_size = 0; + m_in.open(filename, std::ifstream::binary); + if (!m_in.is_open()) { + throw std::runtime_error("cannot open offsets tmp file '" + filename + "'"); + } + m_in.seekg(static_cast(start_index * sizeof(uint64_t)), + std::ios::beg); + refill(); + } + + void close() { + if (m_in.is_open()) m_in.close(); + m_buf.clear(); + m_buf.shrink_to_fit(); + m_pos = 0; + m_size = 0; + } + + /* Return the next offset and advance. Caller must ensure they + don't read past the end of the file. */ + uint64_t next() { + if (m_pos >= m_size) refill(); + assert(m_pos < m_size); + return m_buf[m_pos++]; + } + + private: + std::ifstream m_in; + std::vector m_buf; + uint64_t m_pos = 0; + uint64_t m_size = 0; + + void refill() { + m_pos = 0; + m_in.read(reinterpret_cast(m_buf.data()), + static_cast(m_buf.size() * sizeof(uint64_t))); + const std::streamsize got = m_in.gcount(); + m_size = static_cast(got) / sizeof(uint64_t); + if (m_size == 0) { + throw std::runtime_error("disk_backed_offsets_builder: read past end of file"); + } + } + }; + + /* Construct a reader positioned at `start_index`. Requires freeze(). 
*/ + reader make_reader(uint64_t start_index, + uint64_t buffer_records = default_reader_buffer_records) const { + if (!m_frozen) { + throw std::runtime_error( + "disk_backed_offsets_builder: must freeze() before make_reader()"); + } + reader r; + r.open(m_filename, start_index, buffer_records); + return r; + } + + /* + A copyable forward iterator over the entire offsets file, suitable + for the `Iterator`-template `encode` / `build` calls in + `bits::endpoints_sequence` and `bits::compact_vector`. Holds the + underlying ifstream via shared_ptr so the iterator can be copied + (those APIs may copy the iterator internally). + */ + struct full_iterator { + using iterator_category = std::forward_iterator_tag; + using value_type = uint64_t; + using difference_type = std::ptrdiff_t; + using reference = uint64_t const&; + using pointer = uint64_t const*; + + full_iterator() = default; + + void open(std::string const& filename, + uint64_t buffer_records = default_reader_buffer_records) { + m_state = std::make_shared(); + m_state->buf.assign(std::max(1, buffer_records), 0); + m_state->in.open(filename, std::ifstream::binary); + if (!m_state->in.is_open()) { + throw std::runtime_error("cannot open offsets tmp file '" + filename + "'"); + } + m_state->refill(); + } + + uint64_t operator*() const { + assert(m_state && m_state->pos < m_state->size); + return m_state->buf[m_state->pos]; + } + full_iterator& operator++() { + assert(m_state); + ++m_state->pos; + if (m_state->pos >= m_state->size && !m_state->eof) m_state->refill(); + return *this; + } + + private: + struct state { + std::ifstream in; + std::vector buf; + uint64_t pos = 0; + uint64_t size = 0; + bool eof = false; + void refill() { + pos = 0; + in.read(reinterpret_cast(buf.data()), + static_cast(buf.size() * sizeof(uint64_t))); + const std::streamsize got = in.gcount(); + size = static_cast(got) / sizeof(uint64_t); + if (size == 0) eof = true; + } + }; + std::shared_ptr m_state; + }; + + /* + Stream the offsets file 
into the target Offsets structure (mirrors + the in-RAM builder's `build`). After return, the file is removed + and `size()` resets to 0 to match the in-RAM builder, which clears + its m_v in `build`. + */ + void build(Offsets& target) { + if (!m_frozen) freeze(); + if (m_size == 0) { + remove_file(); + reset_state(); + return; + } + + if constexpr (std::is_same_v) { + full_iterator it; + it.open(m_filename); + target.m_seq.encode(it, m_size, m_back); + } else { + full_iterator it; + it.open(m_filename); + target.m_seq.build(it, m_size, m_nb.per_absolute_offset); + target.m_num_bits_per_relative_offset = m_nb.per_relative_offset; + } + + remove_file(); + reset_state(); + } + + /* Remove the on-disk tmp file (if any). */ + void remove_file() { + if (m_writer.is_open()) m_writer.close(); + if (!m_filename.empty()) std::remove(m_filename.c_str()); + } + +private: + std::string m_filename; + std::ofstream m_writer; + std::vector m_buf; + uint64_t m_writer_buffer_capacity = default_writer_buffer_records; + uint64_t m_size = 0; + uint64_t m_front = 0; + uint64_t m_back = 0; + bool m_have_front = false; + bool m_frozen = false; + num_bits m_nb; + + void flush_buffer() { + if (m_buf.empty()) return; + m_writer.write(reinterpret_cast(m_buf.data()), + static_cast(m_buf.size() * sizeof(uint64_t))); + m_buf.clear(); + } + + void reset_state() { + m_size = 0; + m_buf.clear(); + m_buf.shrink_to_fit(); + m_have_front = false; + m_front = 0; + m_back = 0; + m_frozen = false; + } +}; + +} // namespace sshash diff --git a/include/offsets.hpp b/include/offsets.hpp index e718ed3..b592e7b 100644 --- a/include/offsets.hpp +++ b/include/offsets.hpp @@ -5,6 +5,8 @@ namespace sshash { +template struct disk_backed_offsets_builder; + struct num_bits { num_bits() : per_absolute_offset(0), per_relative_offset(0), per_string_id(0) {} uint64_t per_absolute_offset; @@ -101,6 +103,11 @@ struct offsets // visit_impl(visitor, *this); } + /* Allow disk_backed_offsets_builder to populate m_seq 
directly via a + streaming forward iterator (mirroring what `Seq`'s nested builder + does, but with on-disk values). */ + template friend struct disk_backed_offsets_builder; + protected: Seq m_seq; uint64_t m_num_bits_per_relative_offset; diff --git a/src/builder/compute_minimizer_tuples.cpp b/src/builder/compute_minimizer_tuples.cpp index 8458857..a3e98ec 100644 --- a/src/builder/compute_minimizer_tuples.cpp +++ b/src/builder/compute_minimizer_tuples.cpp @@ -49,14 +49,19 @@ void dictionary_builder::compute_minimizer_tuples() // auto strings_reader = strings_builder.make_reader(); kmer_iterator kmer_it(strings_reader, k); + /* Per-thread forward reader over the offsets file, positioned + so the first `next()` returns offsets[index_begin]. */ + auto offsets_reader = strings_offsets_builder.make_reader(index_begin); + uint64_t prev_offset = offsets_reader.next(); // == offsets[index_begin] hasher_type hasher(build_config.seed); minimizer_iterator minimizer_it(k, m, hasher); minimizer_iterator_rc minimizer_it_rc(k, m, hasher); for (uint64_t i = index_begin; i < index_end; ++i) // { - const uint64_t begin = strings_offsets_builder[i]; - const uint64_t end = strings_offsets_builder[i + 1]; + const uint64_t begin = prev_offset; + const uint64_t end = offsets_reader.next(); // offsets[i + 1] + prev_offset = end; const uint64_t sequence_len = end - begin; assert(sequence_len >= k); From 27c71e8f1d8929a2c739492da63c06a913f2ce19 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 5 May 2026 16:28:01 +0000 Subject: [PATCH 11/32] spill the step-7 compact_vectors to disk; concatenate at save The four largest in-RAM "stored output" structures during the build are the sparse-index compact_vectors (control_codewords, mid_load_buckets, heavy_load_buckets) and the per-skew-partition cvb_positions. For huge inputs these can each be many GB. 
This commit makes the build never materialize them in RAM: - new streaming_compact_vector_writer: writes the same byte layout as bits::compact_vector::visit_impl (size, width, mask, owning_span) directly to a file via a 2-word rolling window, accepting set(index, value) in monotonic index order. Matches the +1 padding word that the in-RAM builder allocates so on-disk bytes are identical. - control_codewords: indices are mphf(minimizer), and the merged minimizers file is sorted by mphf hash, so writes during the combined pass are strictly monotonic. Streamed directly via the writer; no external sort. - heavy_load_buckets: single monotone cursor, also streamed directly via the writer. - mid_load_buckets: per-size cursors interleave across the pass, so each size class's positions are written to its own raw-uint64 tmp file (monotonic within size). After the combined pass, the per-size files are streamed in size order through a streaming_compact_vector_writer to assemble the final mid_load_buckets file. - cvb_positions per skew partition: writes are random by F(kmer). After the partition's MPHF is built, a second pass over the partition's kmer file emits (F(kmer), pos_in_bucket) tuples through the existing parallel_sort + flush + file_merging_iterator external-sort machinery; the sorted stream feeds a per-partition streaming_compact_vector_writer. The streaming saver is extended to take a substitution map keyed by `bits::compact_vector const*`. dictionary_builder populates the dictionary's compact_vector slots with empty placeholders, takes their addresses, and registers each spilled tmp file. The save pass copies bytes from each tmp file at the matching visit slot. For the materializing `build()` flow (used by --check), a new `materialize_spilled_into(d)` step re-loads each spilled tmp file back into the dictionary's in-RAM compact_vectors via essentials::loader, so queries work afterward. 
This brings the RAM peak back briefly at the very end (acceptable since --check inherently needs the full index in RAM). `build_streaming_save()` never materializes; the spilled tmp files are concatenated into the output by the saver and then removed. Verified byte-identical output vs the previous commit on salmonella_enterica m=7, plus full --check on regular, --canonical, multi-thread (-t 4), and --weighted, plus a streaming-save round-trip (sshash check + sshash query) with all five "EVERYTHING OK!" suites and 100% positive matches. No tmp file leaks. Remaining proportional-to-input items in RAM during build: the codewords MPHF (step 4) and the per-skew-partition MPHFs (step 7.2 phase C). pthash returns these as in-memory structs; spilling them would require pthash changes or an intermediate save/load step. Everything else is now bounded by the explicit --ram-limit. https://claude.ai/code/session_01BShS2GDASvEsCAbgJyQVBK --- include/builder/dictionary_builder.hpp | 116 +++++- .../streaming_compact_vector_writer.hpp | 144 ++++++++ include/builder/streaming_save.hpp | 68 +++- src/builder/build_sparse_and_skew_index.cpp | 341 +++++++++++++----- 4 files changed, 552 insertions(+), 117 deletions(-) create mode 100644 include/builder/streaming_compact_vector_writer.hpp diff --git a/include/builder/dictionary_builder.hpp b/include/builder/dictionary_builder.hpp index 6d30fb5..7e4af93 100644 --- a/include/builder/dictionary_builder.hpp +++ b/include/builder/dictionary_builder.hpp @@ -1,16 +1,52 @@ #pragma once +#include + #include "essentials.hpp" #include "include/dictionary.hpp" #include "include/offsets.hpp" #include "include/builder/util.hpp" #include "include/builder/disk_backed_strings.hpp" #include "include/builder/disk_backed_offsets_builder.hpp" +#include "include/builder/streaming_compact_vector_writer.hpp" #include "include/builder/streaming_save.hpp" #include "include/buckets_statistics.hpp" namespace sshash { +/* + Helper: load a serialized 
bits::compact_vector back from a tmp file + into the given in-RAM compact_vector. Used by the materializing build + flow (after step 7) so that --check / queries can run. +*/ +inline void materialize_compact_vector_from_file(bits::compact_vector& cv, + std::string const& filename) { + essentials::loader loader(filename.c_str()); + loader.visit(cv); +} + +/* + Tmp file paths for the compact_vectors that step 7 spills to disk. + Populated by build_sparse_and_skew_index; consumed by step 8 (either + materialized back into RAM for `build()`, or injected into the output + by `build_streaming_save()`). +*/ +struct spilled_components { + std::string control_codewords_path; + std::string mid_load_buckets_path; + std::string heavy_load_buckets_path; + std::vector skew_positions_paths; // one entry per skew partition + + void clear_files() { + if (!control_codewords_path.empty()) std::remove(control_codewords_path.c_str()); + if (!mid_load_buckets_path.empty()) std::remove(mid_load_buckets_path.c_str()); + if (!heavy_load_buckets_path.empty()) std::remove(heavy_load_buckets_path.c_str()); + for (auto const& p : skew_positions_paths) { + if (!p.empty()) std::remove(p.c_str()); + } + } +}; + template struct dictionary_builder // { @@ -24,29 +60,32 @@ struct dictionary_builder // ~dictionary_builder() { strings_builder.remove_file(); strings_offsets_builder.remove_file(); + spilled.clear_files(); } /* - Build a query-ready dictionary in `d`. After this returns, - `d.m_spss.strings` is materialized in RAM (peak briefly equals the - strings size). Use this when the caller needs to query `d` post-build - (e.g., `--check`). + Build a query-ready dictionary in `d`. After this returns, all + spilled components and `d.m_spss.strings` are materialized in RAM + (peak briefly equals the index size). Use this when the caller + needs to query `d` post-build (e.g., `--check`). 
*/ void build(dictionary& d, std::string const& filename) { run_steps_1_through_7(d, filename); - do_step("step 8 (materialize strings to RAM)", [&]() { + do_step("step 8 (materialize spilled components to RAM)", [&]() { + materialize_spilled_into(d); strings_builder.load_into(d.m_spss.strings); strings_builder.remove_file(); + spilled.clear_files(); }); finalize_stats(d); } /* Build the dictionary and stream-save it to `output_filename` without - ever materializing `strings` in RAM. After this returns, `d` is *not* - query-ready (`d.m_spss.strings` is empty). Use this when the caller - only needs the on-disk index file and wants to keep peak RAM bounded - by the build phase. + ever materializing the spilled components or `strings` in RAM. + After this returns, `d` is *not* query-ready. Use this when the + caller only needs the on-disk index file and wants to keep peak RAM + bounded by the build phase. */ void build_streaming_save(dictionary& d, // std::string const& filename, // @@ -54,8 +93,35 @@ struct dictionary_builder // { run_steps_1_through_7(d, filename); do_step("step 8 (stream-save dictionary to disk)", [&]() { - save_streaming(d, output_filename.c_str(), &d.m_spss.strings, strings_builder); + /* Populate placeholder compact_vectors at the visit slots whose + byte content the saver will substitute from disk tmp files. */ + std::unordered_map subs; + if (!spilled.control_codewords_path.empty()) { + subs[&d.m_ssi.codewords.control_codewords] = spilled.control_codewords_path; + } + if (!spilled.mid_load_buckets_path.empty()) { + subs[&d.m_ssi.mid_load_buckets] = spilled.mid_load_buckets_path; + } + if (!spilled.heavy_load_buckets_path.empty()) { + subs[&d.m_ssi.ski.heavy_load_buckets] = spilled.heavy_load_buckets_path; + } + /* skew positions: populate the owning_span with placeholders so + the visit walks the right number of entries and we can take + their addresses for substitution. 
*/ + const std::size_t num_part = spilled.skew_positions_paths.size(); + if (num_part > 0) { + std::vector placeholders(num_part); + d.m_ssi.ski.positions = std::move(placeholders); + for (std::size_t i = 0; i != num_part; ++i) { + if (!spilled.skew_positions_paths[i].empty()) { + subs[&d.m_ssi.ski.positions[i]] = spilled.skew_positions_paths[i]; + } + } + } + save_streaming(d, output_filename.c_str(), &d.m_spss.strings, strings_builder, + std::move(subs)); strings_builder.remove_file(); + spilled.clear_files(); }); finalize_stats(d); } @@ -66,6 +132,7 @@ struct dictionary_builder // disk_backed_offsets_builder strings_offsets_builder; disk_backed_strings strings_builder; weights::builder weights_builder; + spilled_components spilled; uint64_t strings_run_id; @@ -74,6 +141,35 @@ struct dictionary_builder // uint64_t total_time_musec; private: + /* Load each spilled compact_vector tmp file back into the corresponding + in-RAM compact_vector inside `d`. Used by the materializing build + flow so queries can run against `d` (e.g., during --check). 
*/ + void materialize_spilled_into(dictionary& d) { + if (!spilled.control_codewords_path.empty()) { + materialize_compact_vector_from_file(d.m_ssi.codewords.control_codewords, + spilled.control_codewords_path); + } + if (!spilled.mid_load_buckets_path.empty()) { + materialize_compact_vector_from_file(d.m_ssi.mid_load_buckets, + spilled.mid_load_buckets_path); + } + if (!spilled.heavy_load_buckets_path.empty()) { + materialize_compact_vector_from_file(d.m_ssi.ski.heavy_load_buckets, + spilled.heavy_load_buckets_path); + } + const std::size_t num_part = spilled.skew_positions_paths.size(); + if (num_part > 0) { + std::vector positions_vec(num_part); + for (std::size_t i = 0; i != num_part; ++i) { + if (!spilled.skew_positions_paths[i].empty()) { + materialize_compact_vector_from_file(positions_vec[i], + spilled.skew_positions_paths[i]); + } + } + d.m_ssi.ski.positions = std::move(positions_vec); + } + } + void run_steps_1_through_7(dictionary& d, std::string const& filename) { d.m_k = build_config.k; d.m_m = build_config.m; diff --git a/include/builder/streaming_compact_vector_writer.hpp b/include/builder/streaming_compact_vector_writer.hpp new file mode 100644 index 0000000..ea9dce1 --- /dev/null +++ b/include/builder/streaming_compact_vector_writer.hpp @@ -0,0 +1,144 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +namespace sshash { + +/* + Streams a `bits::compact_vector` to disk one entry at a time, accepting + `set(index, value)` calls in monotonically non-decreasing index order + (gaps are filled with zero, matching the default-zero semantics of an + in-RAM compact_vector::builder). + + The on-disk byte layout matches `bits::compact_vector::visit_impl`: + uint64_t m_size + uint64_t m_width + uint64_t m_mask + size_t n (= ceil(m_size * m_width / 64)) + uint64_t m_data[n] (little-endian bit-packed) + + Total RAM footprint: a 2-word rolling window plus the std::ofstream's + own buffer. 
Independently of the number of entries. +*/ +struct streaming_compact_vector_writer { + streaming_compact_vector_writer() = default; + streaming_compact_vector_writer(streaming_compact_vector_writer const&) = delete; + streaming_compact_vector_writer& operator=(streaming_compact_vector_writer const&) = delete; + + void open(std::string const& filename, uint64_t num_entries, uint64_t width) { + if (width == 0) throw std::runtime_error("streaming_compact_vector_writer: width must be > 0"); + if (width > 64) throw std::runtime_error("streaming_compact_vector_writer: width must be <= 64"); + m_filename = filename; + m_num_entries = num_entries; + m_width = width; + /* Match `bits::compact_vector::builder`'s data layout, which + allocates `words_for(size*width) + 1` words: the trailing + padding word allows in-RAM `set_bits` to write across word + boundaries without bounds checking and is part of the + serialized `m_data` owning_span. */ + const uint64_t packed_words = (num_entries == 0) ? 0 : (num_entries * width + 63) / 64; + m_total_words = packed_words + 1; + m_words_written = 0; + m_buf[0] = 0; + m_buf[1] = 0; + m_have_last_index = false; + + m_out.open(filename, std::ofstream::binary | std::ofstream::trunc); + if (!m_out.is_open()) { + throw std::runtime_error("cannot open compact_vector tmp file '" + filename + "'"); + } + + /* Header (matches bits::compact_vector::visit_impl). */ + write_pod(m_num_entries); + write_pod(m_width); + const uint64_t mask = (m_width == 64) ? uint64_t(-1) : ((uint64_t(1) << m_width) - 1); + write_pod(mask); + const std::size_t n = static_cast(m_total_words); + write_pod(n); + } + + /* Write a value at position `index`. Successive calls must satisfy + `index >= previous_index`; gaps are filled with zero. 
*/ + void set(uint64_t index, uint64_t value) { + if (m_have_last_index) { + assert(index >= m_last_index); + } + m_have_last_index = true; + m_last_index = index; + + const uint64_t bit_offset = index * m_width; + const uint64_t word_index = bit_offset / 64; + const uint64_t bit_in_word = bit_offset % 64; + + /* Slide the 2-word window forward to cover word_index. Words below + are now finalized; emit them. */ + while (m_words_written < word_index) { + write_word(m_buf[0]); + m_buf[0] = m_buf[1]; + m_buf[1] = 0; + ++m_words_written; + } + + /* OR `value` (m_width low bits of it) into the window starting at + bit_in_word of m_buf[0]; overflow goes into m_buf[1]. */ + const uint64_t fits_in_word_0 = 64 - bit_in_word; + if (m_width <= fits_in_word_0) { + if (m_width == 64) { + /* bit_in_word must be 0 here */ + m_buf[0] = value; + } else { + m_buf[0] |= value << bit_in_word; + } + } else { + m_buf[0] |= value << bit_in_word; + m_buf[1] |= value >> fits_in_word_0; + } + } + + /* Flush remaining buffered words and close the file. 
*/ + void finalize() { + while (m_words_written < m_total_words) { + write_word(m_buf[0]); + m_buf[0] = m_buf[1]; + m_buf[1] = 0; + ++m_words_written; + } + if (m_out.is_open()) m_out.close(); + } + + std::string const& filename() const { return m_filename; } + uint64_t num_entries() const { return m_num_entries; } + uint64_t width() const { return m_width; } + + void remove_file() { + if (m_out.is_open()) m_out.close(); + if (!m_filename.empty()) std::remove(m_filename.c_str()); + } + +private: + std::string m_filename; + std::ofstream m_out; + uint64_t m_num_entries = 0; + uint64_t m_width = 0; + uint64_t m_total_words = 0; + uint64_t m_words_written = 0; + uint64_t m_buf[2] = {0, 0}; + uint64_t m_last_index = 0; + bool m_have_last_index = false; + + template + void write_pod(T const& v) { + m_out.write(reinterpret_cast(&v), sizeof(T)); + } + void write_word(uint64_t w) { + m_out.write(reinterpret_cast(&w), sizeof(uint64_t)); + } +}; + +} // namespace sshash diff --git a/include/builder/streaming_save.hpp b/include/builder/streaming_save.hpp index 73e4315..9e9de1d 100644 --- a/include/builder/streaming_save.hpp +++ b/include/builder/streaming_save.hpp @@ -1,13 +1,16 @@ #pragma once +#include #include #include #include #include +#include #include #include "essentials.hpp" #include "external/pthash/external/bits/include/bit_vector.hpp" +#include "external/pthash/external/bits/include/compact_vector.hpp" #include "include/builder/disk_backed_strings.hpp" @@ -17,17 +20,25 @@ namespace sshash { A saver that mirrors `essentials::generic_saver`, except that any visit to a specific `bits::bit_vector` instance (identified by address) is redirected to `disk_backed_strings::save_to`, which streams the strings - bytes from the on-disk tmp file. All other visits go through the regular - `essentials` path. + bytes from the on-disk tmp file. 
Likewise, visits to `bits::compact_vector` + instances whose addresses appear in `compact_vector_subs` are replaced + with byte-for-byte streaming from the corresponding tmp file (which is + expected to be in `bits::compact_vector::visit_impl`'s on-disk format). - Using address-based identification means we don't need to add any - intermediate type or marker to `bits::bit_vector` itself. + Address-based identification means we don't need to add any intermediate + type or marker to bits::bit_vector / bits::compact_vector themselves. */ struct streaming_strings_saver { - streaming_strings_saver(std::ostream& os, // - bits::bit_vector const* strings_addr, // - disk_backed_strings const* strings_storage) // - : m_os(os), m_strings_addr(strings_addr), m_strings_storage(strings_storage) { + streaming_strings_saver( + std::ostream& os, // + bits::bit_vector const* strings_addr, // + disk_backed_strings const* strings_storage, // + std::unordered_map compact_vector_subs // + ) + : m_os(os) + , m_strings_addr(strings_addr) + , m_strings_storage(strings_storage) + , m_compact_vector_subs(std::move(compact_vector_subs)) { if (m_strings_addr == nullptr || m_strings_storage == nullptr) { throw std::runtime_error("streaming_strings_saver requires non-null arguments"); } @@ -41,6 +52,13 @@ struct streaming_strings_saver { return; } } + if constexpr (std::is_same_v) { + auto it = m_compact_vector_subs.find(&val); + if (it != m_compact_vector_subs.end()) { + stream_file_into_os(it->second); + return; + } + } if constexpr (essentials::is_pod::value) { essentials::save_pod(m_os, val); } else { @@ -64,6 +82,7 @@ struct streaming_strings_saver { std::ostream& m_os; bits::bit_vector const* m_strings_addr; disk_backed_strings const* m_strings_storage; + std::unordered_map m_compact_vector_subs; template void visit_seq(Vec const& vec) { @@ -77,23 +96,42 @@ struct streaming_strings_saver { for (auto const& v : vec) visit(v); } } + + void stream_file_into_os(std::string const& filename) { + 
std::ifstream in(filename, std::ifstream::binary); + if (!in.is_open()) { + throw std::runtime_error("cannot open spilled component file '" + filename + "'"); + } + char buf[64 * 1024]; + while (in.good()) { + in.read(buf, sizeof(buf)); + const std::streamsize got = in.gcount(); + if (got > 0) m_os.write(buf, got); + } + in.close(); + } }; /* - Save `t` to `filename`, streaming any embedded `bits::bit_vector` whose - address matches `strings_addr` from `strings_storage` instead of from - RAM. Other fields are saved using the standard `essentials` path. + Save `t` to `filename`. Any embedded bits::bit_vector matching + `strings_addr` is streamed from `strings_storage`; any embedded + bits::compact_vector whose address appears in `compact_vector_subs` + has its bytes copied from the corresponding tmp file. Other fields are + saved via the standard essentials path. */ template -void save_streaming(T const& t, char const* filename, // - bits::bit_vector const* strings_addr, // - disk_backed_strings const& strings_storage) // +void save_streaming(T const& t, char const* filename, // + bits::bit_vector const* strings_addr, // + disk_backed_strings const& strings_storage, // + std::unordered_map // + compact_vector_subs = {}) // { std::ofstream out(filename, std::ios::binary); if (!out.good()) { throw std::runtime_error(std::string("error opening file '") + filename + "' for writing"); } - streaming_strings_saver saver(out, strings_addr, &strings_storage); + streaming_strings_saver saver(out, strings_addr, &strings_storage, + std::move(compact_vector_subs)); saver.visit(t); out.close(); } diff --git a/src/builder/build_sparse_and_skew_index.cpp b/src/builder/build_sparse_and_skew_index.cpp index de7c963..3e9851d 100644 --- a/src/builder/build_sparse_and_skew_index.cpp +++ b/src/builder/build_sparse_and_skew_index.cpp @@ -1,4 +1,7 @@ +#include + #include "include/buckets_statistics.hpp" +#include "include/builder/streaming_compact_vector_writer.hpp" namespace sshash { @@ 
-36,6 +39,26 @@ struct kmer_extraction_request { }; #pragma pack(pop) +/* + A (mphf_pos, pos_in_bucket) record used to spill the per-skew-partition + `cvb_positions` to disk. We external-sort these by mphf_pos so the + streaming compact_vector writer can pack the final cvb_positions file in + a single forward pass. +*/ +#pragma pack(push, 4) +struct position_tuple { + position_tuple() {} + position_tuple(uint64_t mphf_pos, uint32_t pib) : mphf_pos(mphf_pos), pib(pib) {} + + bool operator<(position_tuple const& o) const { return mphf_pos < o.mphf_pos; } + bool operator>(position_tuple const& o) const { return mphf_pos > o.mphf_pos; } + static position_tuple max() { return {uint64_t(-1), uint32_t(-1)}; } + + uint64_t mphf_pos; + uint32_t pib; +}; +#pragma pack(pop) + /* Forward iterator over a per-skew-partition tmp file produced by step 7.2 phase (B). Each record is `(kmer.bits, uint32_t pos_in_bucket)`. @@ -157,8 +180,6 @@ void dictionary_builder::build_sparse_and_skew_index( Calculate bits needed for control codewords encoding. Encoding format: ((list_id << min_l) | (bucket_size - 2)) << 2 | status_code - We need: 2 bits (status) + min_l bits (bucket_size) + bits for list_id. - list_id is bounded by the maximum number of buckets sharing the same size. 
*/ const uint64_t bits_for_list_id = std::ceil(std::log2(buckets_stats.max_sparse_buckets_per_size() + 1)); @@ -181,7 +202,7 @@ void dictionary_builder::build_sparse_and_skew_index( } else if (max_bucket_size < (1ULL << constants::max_l)) { num_partitions = log2_max_bucket_size - constants::min_l; } - assert(num_partitions <= 8); // so that we need 3 bits to encode a partition_id + assert(num_partitions <= 8); if (build_config.verbose) { std::cout << "num_buckets_larger_than_1_not_in_skew_index " @@ -197,51 +218,72 @@ void dictionary_builder::build_sparse_and_skew_index( std::cout << "max_bucket_size " << max_bucket_size << std::endl; std::cout << "log2_max_bucket_size " << log2_max_bucket_size << std::endl; std::cout << "num_partitions in skew index " << num_partitions << std::endl; - std::cout << "num_minimizer_positions_of_buckets_larger_than_1 " - << num_minimizer_positions_of_buckets_larger_than_1 << "/" - << num_minimizer_positions << " (" - << (num_minimizer_positions_of_buckets_larger_than_1 * 100.0) / - num_minimizer_positions - << "%)" << std::endl; - std::cout << "num_minimizer_positions_of_buckets_in_skew_index " - << num_minimizer_positions_of_buckets_in_skew_index << "/" - << num_minimizer_positions << " (" - << (num_minimizer_positions_of_buckets_in_skew_index * 100.0) / - num_minimizer_positions - << "%)" << std::endl; } - /* Materialize strings_offsets now: needed below to decode pos_in_seq - into absolute offsets when emitting heavy-bucket kmer requests. - `d.m_spss.strings` is materialized later in step 8 (or stream-saved - directly to disk). */ + /* Materialize strings_offsets now (it's needed below to decode + pos_in_seq into absolute offsets when emitting heavy-bucket kmer + requests). `d.m_spss.strings` is materialized later (step 8) or + stream-saved directly. */ strings_offsets_builder.build(d.m_spss.strings_offsets); /* Precompute the layout of mid_load_buckets from the bucket-size histogram. 
begin_buckets_of_size[s] is the start offset (in - positions, not bits) of size-s bucket positions in mid_load_buckets; - it lets us write each bucket's positions in place during the - single-pass build, without needing to sort buckets by size. */ + positions, not bits) of size-s bucket positions in mid_load_buckets. */ std::vector begin_buckets_of_size(min_size + 1, 0); for (uint64_t s = 3; s <= min_size; ++s) { begin_buckets_of_size[s] = static_cast( // begin_buckets_of_size[s - 1] + buckets_stats.num_buckets_of_size(s - 1) * (s - 1)); } + d.m_ssi.begin_buckets_of_size = std::move(begin_buckets_of_size); + + /* All step-7.1 outputs are spilled to disk; the in-RAM dictionary + fields stay empty (they're populated later either from disk for + --check or substituted by the streaming saver). */ + const uint64_t step7_run_id = pthash::clock_type::now().time_since_epoch().count(); + auto step7_path = [&](std::string const& tag) { + std::stringstream ss; + ss << build_config.tmp_dirname << "/sshash.tmp.run_" << step7_run_id << "." << tag + << ".bin"; + return ss.str(); + }; + + spilled.control_codewords_path = step7_path("control_codewords"); + spilled.mid_load_buckets_path = step7_path("mid_load_buckets"); + spilled.heavy_load_buckets_path = step7_path("heavy_load_buckets"); + + /* Streaming writers for the two compact_vectors that get strictly + monotonic indices during the combined pass (control_codewords: + indexed by bucket_id == mphf hash, monotonic across buckets in + file order; heavy_load_buckets: indexed by a single monotone + cursor advanced inside the heavy branch). 
*/ + streaming_compact_vector_writer control_codewords_writer; + control_codewords_writer.open(spilled.control_codewords_path, num_minimizers, + num_bits_for_control); + streaming_compact_vector_writer heavy_load_writer; + heavy_load_writer.open(spilled.heavy_load_buckets_path, + num_minimizer_positions_of_buckets_in_skew_index, num_bits_per_offset); + + /* mid_load: per-size tmp files of raw uint64_t positions. Each file is + written monotonically within its size class. After the combined + pass we stream them in size order through a streaming + compact_vector writer to assemble the final mid_load_buckets file. */ + auto mid_load_per_size_path = [&](uint64_t s) { + std::stringstream ss; + ss << build_config.tmp_dirname << "/sshash.tmp.run_" << step7_run_id + << ".mid_load_size_" << s << ".bin"; + return ss.str(); + }; + std::vector mid_load_per_size(min_size + 1); + for (uint64_t s = 2; s <= min_size; ++s) { + if (buckets_stats.num_buckets_of_size(s) == 0) continue; + mid_load_per_size[s].open(mid_load_per_size_path(s), + std::ofstream::binary | std::ofstream::trunc); + if (!mid_load_per_size[s].is_open()) { + throw std::runtime_error("cannot open mid_load per-size tmp file"); + } + } - bits::compact_vector::builder control_codewords_builder; - bits::compact_vector::builder mid_load_buckets_builder; - bits::compact_vector::builder heavy_load_buckets_builder; - control_codewords_builder.resize(num_minimizers, num_bits_for_control); - mid_load_buckets_builder.resize(num_minimizer_positions_of_buckets_larger_than_1, - num_bits_per_offset); - heavy_load_buckets_builder.resize(num_minimizer_positions_of_buckets_in_skew_index, - num_bits_per_offset); - - /* Per-size cursor for mid_load (initialized to begin_buckets_of_size) - and per-size list_id counter; monotone cursor for heavy_load. 
*/ - std::vector mid_load_cursor(min_size + 1, 0); - for (uint64_t s = 2; s <= min_size; ++s) mid_load_cursor[s] = begin_buckets_of_size[s]; std::vector list_id_per_size(min_size + 1, 0); uint64_t heavy_load_cursor = 0; @@ -281,9 +323,6 @@ void dictionary_builder::build_sparse_and_skew_index( }); const uint64_t id = num_request_runs.fetch_add(1); const std::string fn = request_run_filename(id); - if (build_config.verbose) { - std::cout << "saving to file '" << fn << "'..." << std::endl; - } std::ofstream out(fn, std::ofstream::binary); if (!out.is_open()) throw std::runtime_error("cannot open file"); out.write(reinterpret_cast(request_buffer.data()), @@ -307,9 +346,15 @@ void dictionary_builder::build_sparse_and_skew_index( /* Combined pass: stream the merged minimizers file once and, per - bucket, write the appropriate part of the sparse index. For heavy - buckets we also emit kmer-extraction requests in-line (what was - formerly step 7.2 phase A). No mmap; no in-RAM `buckets` array. + bucket, write the appropriate part of the sparse index DIRECTLY TO + DISK via streaming compact_vector writers (control_codewords and + heavy_load_buckets) or per-size raw-value tmp files (mid_load). + For heavy buckets we also emit kmer-extraction requests in-line. + + Buckets are visited in mphf-hash (= bucket_id) order, so writes to + control_codewords are strictly monotonic. heavy_load_cursor is also + monotonic across the whole pass. mid_load per-size cursors are + each monotonic within their size class. */ { streaming_minimizer_bucket_reader reader; @@ -332,43 +377,43 @@ void dictionary_builder::build_sparse_and_skew_index( /* Singleton: code = |offset|0|, LSB = 0. 
*/ const uint64_t code = bucket_buf.front().pos_in_seq << 1; assert(code < (uint64_t(1) << num_bits_for_control)); - control_codewords_builder.set(bucket_id, code); + control_codewords_writer.set(bucket_id, code); } else if (bucket_size <= min_size) { - /* Mid-load: write positions at the per-size cursor and - assign the next list_id for this size. */ + /* Mid-load: write positions to per-size raw file at the + per-size cursor; assign the next list_id for this size. */ const uint64_t list_id = list_id_per_size[bucket_size]++; const uint64_t code = (((list_id << constants::min_l) | (bucket_size - 2)) << 2) | 1; assert(code < (uint64_t(1) << num_bits_for_control)); - control_codewords_builder.set(bucket_id, code); + control_codewords_writer.set(bucket_id, code); - uint64_t cursor = mid_load_cursor[bucket_size]; + auto& out = mid_load_per_size[bucket_size]; uint64_t prev_pos_in_seq = constants::invalid_uint64; for (auto const& mt : bucket_buf) { if (mt.pos_in_seq != prev_pos_in_seq) { - mid_load_buckets_builder.set(cursor++, mt.pos_in_seq); + const uint64_t v = mt.pos_in_seq; + out.write(reinterpret_cast(&v), sizeof(uint64_t)); prev_pos_in_seq = mt.pos_in_seq; } } - mid_load_cursor[bucket_size] = cursor; } else { - /* Heavy: write positions at the monotone cursor, set the - codeword (encodes the start offset and partition id), - and emit kmer-extraction requests for each super-kmer - in the bucket. */ + /* Heavy: write positions at the monotone heavy_load_cursor, + set the codeword (encodes the start offset and partition + id), and emit kmer-extraction requests for each + super-kmer in the bucket. 
*/ const uint64_t partition_id = partition_for_size(bucket_size); assert(partition_id < num_partitions); const uint64_t bucket_begin = heavy_load_cursor; const uint64_t code = (((bucket_begin << 3) | partition_id) << 2) | 3; assert(code < (uint64_t(1) << num_bits_for_control)); - control_codewords_builder.set(bucket_id, code); + control_codewords_writer.set(bucket_id, code); uint32_t pos_in_bucket = uint32_t(-1); uint64_t prev_pos_in_seq = constants::invalid_uint64; for (auto const& mt : bucket_buf) { num_kmers_in_partition[partition_id] += mt.num_kmers_in_super_kmer; if (mt.pos_in_seq != prev_pos_in_seq) { - heavy_load_buckets_builder.set(heavy_load_cursor++, mt.pos_in_seq); + heavy_load_writer.set(heavy_load_cursor++, mt.pos_in_seq); prev_pos_in_seq = mt.pos_in_seq; ++pos_in_bucket; } @@ -387,11 +432,47 @@ void dictionary_builder::build_sparse_and_skew_index( flush_request_buffer(); } - /* Build sparse-index structures into the dictionary. */ - d.m_ssi.begin_buckets_of_size = std::move(begin_buckets_of_size); - control_codewords_builder.build(d.m_ssi.codewords.control_codewords); - mid_load_buckets_builder.build(d.m_ssi.mid_load_buckets); - heavy_load_buckets_builder.build(d.m_ssi.ski.heavy_load_buckets); + /* Finalize the directly-streamed compact_vector files. */ + control_codewords_writer.finalize(); + heavy_load_writer.finalize(); + + /* Close per-size mid_load files. */ + for (uint64_t s = 2; s <= min_size; ++s) { + if (mid_load_per_size[s].is_open()) mid_load_per_size[s].close(); + } + + /* Concatenate per-size mid_load files in size order into the final + mid_load_buckets compact_vector file via the streaming writer. + Each per-size file holds raw uint64_t values written monotonically + within its size class; we just stream them through, packing into + num_bits_per_offset-bit fields at the precomputed begin offset for + each size. 
*/ + { + streaming_compact_vector_writer mid_load_writer; + mid_load_writer.open(spilled.mid_load_buckets_path, + num_minimizer_positions_of_buckets_larger_than_1, + num_bits_per_offset); + uint64_t global_index = 0; + for (uint64_t s = 2; s <= min_size; ++s) { + const uint64_t expected = buckets_stats.num_buckets_of_size(s) * s; + if (expected == 0) continue; + std::ifstream in(mid_load_per_size_path(s), std::ifstream::binary); + if (!in.is_open()) { + throw std::runtime_error("cannot reopen mid_load per-size tmp file"); + } + for (uint64_t i = 0; i != expected; ++i) { + uint64_t v; + in.read(reinterpret_cast(&v), sizeof(uint64_t)); + if (in.gcount() != static_cast(sizeof(uint64_t))) { + throw std::runtime_error("mid_load per-size tmp file truncated"); + } + mid_load_writer.set(global_index++, v); + } + in.close(); + std::remove(mid_load_per_size_path(s).c_str()); + } + mid_load_writer.finalize(); + } timer.stop(); build_stats.add("step 7.1 (build sparse index)", uint64_t(timer.elapsed())); @@ -411,8 +492,8 @@ void dictionary_builder::build_sparse_and_skew_index( Phases (B) and (C) below; phase (A) was folded into the combined sparse pass above. Phase (B) extracts k-mers from `strings` in a single forward sweep guided by the externally-sorted requests, and - phase (C) builds the per-partition MPHF + positions in external - memory from the per-partition kmer files. + phase (C) builds the per-partition MPHF + cvb_positions on disk + from the per-partition kmer files. 
*/ timer.start(); @@ -476,8 +557,6 @@ void dictionary_builder::build_sparse_and_skew_index( kmer = std::min(kmer, kmer_rc); } auto& w = partition_writers[req.partition_id]; - /* write only `kmer.bits` (avoids serializing the vptr that - `uint_kmer_t` carries due to its virtual destructor) */ w.write(reinterpret_cast(&kmer.bits), sizeof(kmer.bits)); w.write(reinterpret_cast(&req.pos_in_bucket), sizeof(req.pos_in_bucket)); @@ -496,27 +575,55 @@ void dictionary_builder::build_sparse_and_skew_index( } } - /* (C) per-partition MPHF + positions build */ + /* + (C) per-partition MPHF + cvb_positions build, both on disk. + + Per partition: + (1) Build MPHF in external memory by streaming the partition's + kmer file (pthash spills hashes to tmp_dirname under its own + ram budget). + (2) Stream-read the kmer file, compute F(kmer), emit + (F(kmer), pos_in_bucket) tuples to disk; external-sort by + F(kmer); stream sorted tuples through a + streaming_compact_vector_writer to produce the partition's + cvb_positions tmp file. + + Only the MPHF itself is held in RAM (pthash returns it as an + in-memory struct); cvb_positions is fully spilled. + */ { + spilled.skew_positions_paths.assign(num_partitions, std::string()); std::vector> mphfs; - std::vector positions; mphfs.resize(num_partitions); - positions.resize(num_partitions); pthash::build_configuration mphf_build_config; - mphf_build_config.lambda = - build_config.lambda + 2.0; /* Use higher lambda here since we have less keys. 
*/ + mphf_build_config.lambda = build_config.lambda + 2.0; mphf_build_config.alpha = 0.94; mphf_build_config.seed = util::get_seed_for_hash_function(build_config); mphf_build_config.verbose = false; mphf_build_config.num_threads = build_config.num_threads; mphf_build_config.avg_partition_size = constants::avg_partition_size; - /* External-memory PHF: bound RAM by `--ram-limit` and spill hashes - to `tmp_dirname` rather than holding the partition's keys - (~16 B/kmer) and their hashes simultaneously in RAM. */ mphf_build_config.ram = (build_config.ram_limit_in_GiB * essentials::GiB) / 2; mphf_build_config.tmp_dir = build_config.tmp_dirname; + const uint64_t pos_run_basename_id = pthash::clock_type::now().time_since_epoch().count(); + auto pos_run_filename = [&](uint64_t partition_id, uint64_t id) { + std::stringstream ss; + ss << build_config.tmp_dirname << "/sshash.tmp.run_" << pos_run_basename_id + << ".pos_runs.p" << partition_id << "." << id << ".bin"; + return ss.str(); + }; + auto skew_positions_filename = [&](uint64_t partition_id) { + std::stringstream ss; + ss << build_config.tmp_dirname << "/sshash.tmp.run_" << pos_run_basename_id + << ".skew_positions.p" << partition_id << ".bin"; + return ss.str(); + }; + + const uint64_t pos_buffer_capacity = std::max( + uint64_t(1) << 16, + (build_config.ram_limit_in_GiB * essentials::GiB) / (4 * sizeof(position_tuple))); + uint64_t lower = min_size; uint64_t upper = 2 * lower; uint64_t num_bits_per_pos = constants::min_l + 1; @@ -537,7 +644,7 @@ void dictionary_builder::build_sparse_and_skew_index( if (n > 0) // { - const std::string fn = skew_partition_filename(partition_id); + const std::string kmer_fn = skew_partition_filename(partition_id); if (build_config.verbose) { const uint64_t avg_partition_size = @@ -552,14 +659,11 @@ void dictionary_builder::build_sparse_and_skew_index( << ")..." << std::endl; } - /* (1) Build the MPHF by streaming kmers from the partition - file. 
pthash's external-memory builder spills hashes - to tmp_dir under its own RAM budget; the iterator's - footprint is constant. */ + /* (1) Build the MPHF by streaming kmers from the partition file. */ auto& F = mphfs[partition_id]; { skew_partition_kmer_iterator iter; - iter.open(fn); + iter.open(kmer_fn); F.build_in_external_memory(iter, n, mphf_build_config); iter.close(); } @@ -570,14 +674,31 @@ void dictionary_builder::build_sparse_and_skew_index( << static_cast(F.num_bits()) / F.num_keys() << std::endl; } - /* (2) Re-stream the file to fill cvb_positions: for each - (kmer, pos_in_bucket), set cvb_positions[F(kmer)] = - pos_in_bucket. Only cvb_positions itself stays in RAM - (n * num_bits_per_pos bits, the actual stored output). */ - bits::compact_vector::builder cvb_positions; - cvb_positions.resize(n, num_bits_per_pos); + /* (2a) Stream-read kmer file, compute F(kmer), externally + sort (F(kmer), pos_in_bucket) tuples by F(kmer). */ + std::atomic pos_num_runs{0}; { - std::ifstream in(fn, std::ifstream::binary); + std::vector pos_buffer; + pos_buffer.reserve(pos_buffer_capacity); + auto flush_pos_buffer = [&]() { + if (pos_buffer.empty()) return; + parallel_sort(pos_buffer, build_config.num_threads, + [](position_tuple const& a, position_tuple const& b) { + return a.mphf_pos < b.mphf_pos; + }); + const uint64_t id = pos_num_runs.fetch_add(1); + std::ofstream out(pos_run_filename(partition_id, id), + std::ofstream::binary); + if (!out.is_open()) { + throw std::runtime_error("cannot open positions tuple run file"); + } + out.write(reinterpret_cast(pos_buffer.data()), + pos_buffer.size() * sizeof(position_tuple)); + out.close(); + pos_buffer.clear(); + }; + + std::ifstream in(kmer_fn, std::ifstream::binary); if (!in.is_open()) { throw std::runtime_error("cannot open skew-partition tmp file"); } @@ -586,19 +707,54 @@ void dictionary_builder::build_sparse_and_skew_index( in.read(reinterpret_cast(&kmer.bits), sizeof(kmer.bits)); uint32_t pib; 
in.read(reinterpret_cast(&pib), sizeof(pib)); - cvb_positions.set(F(kmer), pib); + const uint64_t pos = F(kmer); + if (pos_buffer.size() == pos_buffer_capacity) flush_pos_buffer(); + pos_buffer.emplace_back(pos, pib); } in.close(); + std::remove(kmer_fn.c_str()); + flush_pos_buffer(); } - std::remove(fn.c_str()); - auto& P = positions[partition_id]; - cvb_positions.build(P); + /* (2b) Stream sorted tuples through the streaming + compact_vector writer to produce the partition's + cvb_positions tmp file. */ + { + spilled.skew_positions_paths[partition_id] = + skew_positions_filename(partition_id); + streaming_compact_vector_writer pos_writer; + pos_writer.open(spilled.skew_positions_paths[partition_id], n, + num_bits_per_pos); + + struct pos_run_names_iterator { + pos_run_names_iterator(uint64_t partition_id, + std::function fn) + : i(0), partition_id(partition_id), fn(std::move(fn)) {} + std::string operator*() { return fn(partition_id, i); } + void operator++() { ++i; } + uint64_t i; + uint64_t partition_id; + std::function fn; + }; + pos_run_names_iterator names_it(partition_id, pos_run_filename); + file_merging_iterator merger(names_it, pos_num_runs.load()); + while (merger.has_next()) { + position_tuple pt = *merger; + pos_writer.set(pt.mphf_pos, pt.pib); + merger.next(); + } + merger.close(); + pos_writer.finalize(); + } + + /* Cleanup the position-tuple run files. 
*/ + for (uint64_t i = 0; i != pos_num_runs.load(); ++i) { + std::remove(pos_run_filename(partition_id, i).c_str()); + } if (build_config.verbose) { - std::cout << " built positions[" << partition_id << "] for " << P.size() - << " kmers; bits/key = " << (P.num_bytes() * 8.0) / P.size() - << std::endl; + std::cout << " built positions[" << partition_id << "] for " << n + << " kmers; bits/key = " << num_bits_per_pos << std::endl; } } @@ -613,7 +769,8 @@ void dictionary_builder::build_sparse_and_skew_index( } d.m_ssi.ski.mphfs = std::move(mphfs); - d.m_ssi.ski.positions = std::move(positions); + /* d.m_ssi.ski.positions stays empty here; it will be populated + either by step 8 (materialize) or substituted at stream-save. */ } timer.stop(); From 2c73e09050bf48a7504de954d375a53a1d0db816 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 5 May 2026 17:27:56 +0000 Subject: [PATCH 12/32] spill the codewords + per-skew-partition MPHFs to disk The accumulating in-RAM MPHFs were the dominant resident memory at the end of phase C. For the HPRC k=63 m=31 canonical benchmark (5.9 B kmers, ~5.4 B in skew at ~3 bits/key) they sum to ~2 GB held simultaneously through save. This commit spills them to disk and concatenates at save: - Step 5 (hash_minimizers): the codewords MPHF is no longer needed after the minimizer values are remapped. Save it to a tmp file via essentials::save and default-assign the in-RAM struct to free it. - Step 7.2 phase C: after each partition's cvb_positions has been written to disk, save that partition's MPHF to a tmp file and default-assign it. Subsequent partitions never coexist with prior partitions' MPHFs. - The streaming saver gains an address+type-keyed substitution map (typed_address_sub). 
Type discrimination is necessary because in C++ a struct's address coincides with the address of its first member when the struct has standard layout, so address alone is ambiguous (visiting sparse_and_skew_index would otherwise match a substitution registered for its first member's first member, the codewords MPHF). - dictionary_builder uses a `register_sub` helper that captures the type via typeid for each registration. The saver only fires the substitution when both address and std::type_index(typeid(T)) match. - Materializing build() flow loads each spilled MPHF back via essentials::loader so queries work afterward. Verified byte-identical output vs the previous commit on salmonella_enterica m=7, plus full --check on regular, --canonical, multi-thread (-t 4), and --weighted, plus a streaming-save round-trip (sshash check + sshash query) with all five "EVERYTHING OK!" suites and 100% positive matches. No tmp file leaks. After this commit, the proportional-to-input items in RAM during build are bounded by --ram-limit: - pthash external-memory builder (capped at ram_limit/2), - one current MPHF being built (~bits/key * partition_size), - per-step external-sort buffers (capped), - the stored compact_vectors, MPHFs, etc. all spill. 
https://claude.ai/code/session_01BShS2GDASvEsCAbgJyQVBK --- include/builder/dictionary_builder.hpp | 86 ++++++++++++---- include/builder/streaming_save.hpp | 103 +++++++++++++------- src/builder/build_sparse_and_skew_index.cpp | 15 +++ 3 files changed, 153 insertions(+), 51 deletions(-) diff --git a/include/builder/dictionary_builder.hpp b/include/builder/dictionary_builder.hpp index 7e4af93..58a3ab1 100644 --- a/include/builder/dictionary_builder.hpp +++ b/include/builder/dictionary_builder.hpp @@ -36,14 +36,20 @@ struct spilled_components { std::string mid_load_buckets_path; std::string heavy_load_buckets_path; std::vector skew_positions_paths; // one entry per skew partition + std::string codewords_mphf_path; // step-4 minimizers MPHF + std::vector skew_mphfs_paths; // one entry per skew partition void clear_files() { if (!control_codewords_path.empty()) std::remove(control_codewords_path.c_str()); if (!mid_load_buckets_path.empty()) std::remove(mid_load_buckets_path.c_str()); if (!heavy_load_buckets_path.empty()) std::remove(heavy_load_buckets_path.c_str()); + if (!codewords_mphf_path.empty()) std::remove(codewords_mphf_path.c_str()); for (auto const& p : skew_positions_paths) { if (!p.empty()) std::remove(p.c_str()); } + for (auto const& p : skew_mphfs_paths) { + if (!p.empty()) std::remove(p.c_str()); + } } }; @@ -93,28 +99,46 @@ struct dictionary_builder // { run_steps_1_through_7(d, filename); do_step("step 8 (stream-save dictionary to disk)", [&]() { - /* Populate placeholder compact_vectors at the visit slots whose - byte content the saver will substitute from disk tmp files. */ - std::unordered_map subs; + /* Address+type-keyed substitution map. The saver replaces the + visit byte content of any registered (address, type) pair + with the bytes of the corresponding tmp file. Type matching + disambiguates the case where a struct's address coincides + with the address of its first member. 
*/ + std::unordered_map subs; if (!spilled.control_codewords_path.empty()) { - subs[&d.m_ssi.codewords.control_codewords] = spilled.control_codewords_path; + register_sub(subs, &d.m_ssi.codewords.control_codewords, + spilled.control_codewords_path); } if (!spilled.mid_load_buckets_path.empty()) { - subs[&d.m_ssi.mid_load_buckets] = spilled.mid_load_buckets_path; + register_sub(subs, &d.m_ssi.mid_load_buckets, spilled.mid_load_buckets_path); } if (!spilled.heavy_load_buckets_path.empty()) { - subs[&d.m_ssi.ski.heavy_load_buckets] = spilled.heavy_load_buckets_path; + register_sub(subs, &d.m_ssi.ski.heavy_load_buckets, + spilled.heavy_load_buckets_path); + } + if (!spilled.codewords_mphf_path.empty()) { + register_sub(subs, &d.m_ssi.codewords.mphf, spilled.codewords_mphf_path); } - /* skew positions: populate the owning_span with placeholders so - the visit walks the right number of entries and we can take - their addresses for substitution. */ - const std::size_t num_part = spilled.skew_positions_paths.size(); + /* Skew positions / mphfs: populate the owning_spans with + placeholders so the visit walks the right number of entries + and we can take their addresses for substitution. 
*/ + const std::size_t num_part = std::max(spilled.skew_positions_paths.size(), + spilled.skew_mphfs_paths.size()); if (num_part > 0) { - std::vector placeholders(num_part); - d.m_ssi.ski.positions = std::move(placeholders); - for (std::size_t i = 0; i != num_part; ++i) { + std::vector position_placeholders(num_part); + std::vector> mphf_placeholders(num_part); + d.m_ssi.ski.positions = std::move(position_placeholders); + d.m_ssi.ski.mphfs = std::move(mphf_placeholders); + for (std::size_t i = 0; i != spilled.skew_positions_paths.size(); ++i) { if (!spilled.skew_positions_paths[i].empty()) { - subs[&d.m_ssi.ski.positions[i]] = spilled.skew_positions_paths[i]; + register_sub(subs, &d.m_ssi.ski.positions[i], + spilled.skew_positions_paths[i]); + } + } + for (std::size_t i = 0; i != spilled.skew_mphfs_paths.size(); ++i) { + if (!spilled.skew_mphfs_paths[i].empty()) { + register_sub(subs, &d.m_ssi.ski.mphfs[i], + spilled.skew_mphfs_paths[i]); } } } @@ -157,16 +181,30 @@ struct dictionary_builder // materialize_compact_vector_from_file(d.m_ssi.ski.heavy_load_buckets, spilled.heavy_load_buckets_path); } - const std::size_t num_part = spilled.skew_positions_paths.size(); + /* Reload the spilled MPHFs back into RAM so queries work. 
*/ + if (!spilled.codewords_mphf_path.empty()) { + essentials::loader loader(spilled.codewords_mphf_path.c_str()); + loader.visit(d.m_ssi.codewords.mphf); + } + const std::size_t num_part = std::max(spilled.skew_positions_paths.size(), + spilled.skew_mphfs_paths.size()); if (num_part > 0) { std::vector positions_vec(num_part); - for (std::size_t i = 0; i != num_part; ++i) { + std::vector> mphfs_vec(num_part); + for (std::size_t i = 0; i != spilled.skew_positions_paths.size(); ++i) { if (!spilled.skew_positions_paths[i].empty()) { materialize_compact_vector_from_file(positions_vec[i], spilled.skew_positions_paths[i]); } } + for (std::size_t i = 0; i != spilled.skew_mphfs_paths.size(); ++i) { + if (!spilled.skew_mphfs_paths[i].empty()) { + essentials::loader loader(spilled.skew_mphfs_paths[i].c_str()); + loader.visit(mphfs_vec[i]); + } + } d.m_ssi.ski.positions = std::move(positions_vec); + d.m_ssi.ski.mphfs = std::move(mphfs_vec); } } @@ -288,7 +326,8 @@ struct dictionary_builder // std::string filename = minimizers.get_minimizers_filename(); std::ifstream input(filename, std::ifstream::binary); - auto const& f = d.m_ssi.codewords.mphf; + auto& f_mut = d.m_ssi.codewords.mphf; + auto const& f = f_mut; const uint64_t num_threads = build_config.num_threads; const uint64_t num_files_to_merge = minimizers.num_files_to_merge(); @@ -343,6 +382,19 @@ struct dictionary_builder // } input.close(); + + /* The codewords MPHF is no longer needed during build (step 6 onward + reads minimizer values that step 5 has already replaced with + mphf hashes; step 7 references mphf hashes only as bucket ids). + Spill it to disk and free its in-RAM footprint. 
*/ + { + std::stringstream ss; + ss << build_config.tmp_dirname << "/sshash.tmp.run_" << strings_run_id + << ".codewords_mphf.bin"; + spilled.codewords_mphf_path = ss.str(); + essentials::save(f_mut, spilled.codewords_mphf_path.c_str()); + f_mut = minimizers_pthash_type{}; + } } }; diff --git a/include/builder/streaming_save.hpp b/include/builder/streaming_save.hpp index 9e9de1d..7ccc87c 100644 --- a/include/builder/streaming_save.hpp +++ b/include/builder/streaming_save.hpp @@ -5,40 +5,63 @@ #include #include #include +#include +#include #include +#include #include #include "essentials.hpp" #include "external/pthash/external/bits/include/bit_vector.hpp" -#include "external/pthash/external/bits/include/compact_vector.hpp" #include "include/builder/disk_backed_strings.hpp" namespace sshash { /* - A saver that mirrors `essentials::generic_saver`, except that any visit - to a specific `bits::bit_vector` instance (identified by address) is - redirected to `disk_backed_strings::save_to`, which streams the strings - bytes from the on-disk tmp file. Likewise, visits to `bits::compact_vector` - instances whose addresses appear in `compact_vector_subs` are replaced - with byte-for-byte streaming from the corresponding tmp file (which is - expected to be in `bits::compact_vector::visit_impl`'s on-disk format). - - Address-based identification means we don't need to add any intermediate - type or marker to bits::bit_vector / bits::compact_vector themselves. + A typed substitution: the saver replaces the visit byte content of an + object at `address` with the bytes of `filename` only if the visited + type T satisfies `std::type_index(typeid(T)) == type`. + + Type discrimination is necessary because in C++ a struct's address + coincides with the address of its first member when the struct has + standard layout. Without the type check, a substitution registered + for an inner field would also fire (incorrectly) on every enclosing + parent that shares its address. 
+*/ +struct typed_address_sub { + std::string filename; + std::type_index type; +}; + +/* + A saver that mirrors `essentials::generic_saver`, except for two + interception mechanisms used during streaming save: + + 1. The `bits::bit_vector` instance whose address matches `strings_addr` + has its bytes streamed from `strings_storage` (which writes the + same on-disk format `bits::bit_vector::visit_impl` produces). + + 2. Any object whose address appears in `address_subs` has its visit + byte content replaced by a copy of the corresponding tmp file. + This is type-agnostic — it works for `bits::compact_vector`, for + pthash MPHFs, or anything else whose serialized form has been + saved to a file via `essentials::save`. + + The substitution check is performed at the start of every visit + call (whatever T is); if no match, the call falls through to the + regular `essentials::generic_saver` logic (POD via save_pod, or + recursion via val.visit(*this)). */ struct streaming_strings_saver { - streaming_strings_saver( - std::ostream& os, // - bits::bit_vector const* strings_addr, // - disk_backed_strings const* strings_storage, // - std::unordered_map compact_vector_subs // - ) + streaming_strings_saver(std::ostream& os, // + bits::bit_vector const* strings_addr, // + disk_backed_strings const* strings_storage, // + std::unordered_map address_subs) : m_os(os) , m_strings_addr(strings_addr) , m_strings_storage(strings_storage) - , m_compact_vector_subs(std::move(compact_vector_subs)) { + , m_address_subs(std::move(address_subs)) { if (m_strings_addr == nullptr || m_strings_storage == nullptr) { throw std::runtime_error("streaming_strings_saver requires non-null arguments"); } @@ -46,19 +69,25 @@ struct streaming_strings_saver { template void visit(T const& val) { + /* Type+address substitution (compact_vectors, MPHFs, etc.). + Both must match: address alone is ambiguous when a struct + shares its address with its first member. 
*/ + void const* addr = static_cast(&val); + auto it = m_address_subs.find(addr); + if (it != m_address_subs.end() && it->second.type == std::type_index(typeid(T))) { + stream_file_into_os(it->second.filename); + return; + } + /* Strings: dedicated callback because the on-disk strings file + holds raw words (not the bits::bit_vector serialized form); + `disk_backed_strings::save_to(os)` writes the visit_impl format + on the fly. */ if constexpr (std::is_same_v) { if (&val == m_strings_addr) { m_strings_storage->save_to(m_os); return; } } - if constexpr (std::is_same_v) { - auto it = m_compact_vector_subs.find(&val); - if (it != m_compact_vector_subs.end()) { - stream_file_into_os(it->second); - return; - } - } if constexpr (essentials::is_pod::value) { essentials::save_pod(m_os, val); } else { @@ -82,7 +111,7 @@ struct streaming_strings_saver { std::ostream& m_os; bits::bit_vector const* m_strings_addr; disk_backed_strings const* m_strings_storage; - std::unordered_map m_compact_vector_subs; + std::unordered_map m_address_subs; template void visit_seq(Vec const& vec) { @@ -115,25 +144,31 @@ struct streaming_strings_saver { /* Save `t` to `filename`. Any embedded bits::bit_vector matching `strings_addr` is streamed from `strings_storage`; any embedded - bits::compact_vector whose address appears in `compact_vector_subs` - has its bytes copied from the corresponding tmp file. Other fields are - saved via the standard essentials path. + object whose address appears in `address_subs` has its bytes copied + from the corresponding tmp file. Other fields are saved via the + standard essentials path. 
*/ template void save_streaming(T const& t, char const* filename, // bits::bit_vector const* strings_addr, // disk_backed_strings const& strings_storage, // - std::unordered_map // - compact_vector_subs = {}) // -{ + std::unordered_map address_subs = {}) { std::ofstream out(filename, std::ios::binary); if (!out.good()) { throw std::runtime_error(std::string("error opening file '") + filename + "' for writing"); } - streaming_strings_saver saver(out, strings_addr, &strings_storage, - std::move(compact_vector_subs)); + streaming_strings_saver saver(out, strings_addr, &strings_storage, std::move(address_subs)); saver.visit(t); out.close(); } +/* Helper: register a typed substitution at the address of `addr`. */ +template +inline void register_sub(std::unordered_map& subs, + T const* addr, std::string filename) { + subs.insert_or_assign(static_cast(addr), + typed_address_sub{std::move(filename), + std::type_index(typeid(T))}); +} + } // namespace sshash diff --git a/src/builder/build_sparse_and_skew_index.cpp b/src/builder/build_sparse_and_skew_index.cpp index 3e9851d..d4deba3 100644 --- a/src/builder/build_sparse_and_skew_index.cpp +++ b/src/builder/build_sparse_and_skew_index.cpp @@ -593,6 +593,7 @@ void dictionary_builder::build_sparse_and_skew_index( */ { spilled.skew_positions_paths.assign(num_partitions, std::string()); + spilled.skew_mphfs_paths.assign(num_partitions, std::string()); std::vector> mphfs; mphfs.resize(num_partitions); @@ -619,6 +620,12 @@ void dictionary_builder::build_sparse_and_skew_index( << ".skew_positions.p" << partition_id << ".bin"; return ss.str(); }; + auto skew_mphf_filename = [&](uint64_t partition_id) { + std::stringstream ss; + ss << build_config.tmp_dirname << "/sshash.tmp.run_" << pos_run_basename_id + << ".skew_mphf.p" << partition_id << ".bin"; + return ss.str(); + }; const uint64_t pos_buffer_capacity = std::max( uint64_t(1) << 16, @@ -756,6 +763,14 @@ void dictionary_builder::build_sparse_and_skew_index( std::cout << " built 
positions[" << partition_id << "] for " << n << " kmers; bits/key = " << num_bits_per_pos << std::endl; } + + /* Spill the partition's MPHF to disk (no longer needed + during build) and free its in-RAM footprint. The + accumulating skew MPHFs were the dominant resident + memory at the end of phase C. */ + spilled.skew_mphfs_paths[partition_id] = skew_mphf_filename(partition_id); + essentials::save(F, spilled.skew_mphfs_paths[partition_id].c_str()); + F = kmers_pthash_type{}; } /* advance partition state for the next iteration */ From dddee47de79dcd72717b9107eed4b9d86a7b2ed5 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 5 May 2026 18:01:23 +0000 Subject: [PATCH 13/32] cap pthash mphf num_threads by --ram-limit pthash's `partitioned_phf::build` builds the partitioned MPHF's sub-partitions in parallel, with `mphf_build_config.num_threads` sub-partitions running simultaneously. Each per-partition build allocates a `pairs_t` of ~`avg_partition_size * 16 B` (~48 MB with the default avg_partition_size = 3M); with -t 64 that balloons to ~3 GB of pthash internal memory, dominating the build's RSS regardless of how aggressively sshash spills its own structures. Add `util::cap_mphf_num_threads(requested, ram_limit_in_GiB)`: budget ~ram_limit/4 GiB for pthash's per-partition build memory and conservatively assume 64 MiB per parallel sub-partition (48 MB pairs_t + sort temporary + slack). Apply at the two pthash build sites (step 4 codewords MPHF, step 7.2 phase C skew partition MPHFs). For -g 2 -t 64 this caps pthash to 8 threads (1 GiB-pthash budget over 64 MiB-per-thread = 16 capped further by /4 budget fraction = 8). Build time may increase modestly; peak RSS should drop well below the previous ~3.3 GB toward the 2 GiB target. Verified byte-identical output on salmonella_enterica m=7, plus full --check on regular, --canonical, multi-thread (-t 4), and --weighted. 
https://claude.ai/code/session_01BShS2GDASvEsCAbgJyQVBK --- include/minimizers_control_map.hpp | 3 ++- include/util.hpp | 22 +++++++++++++++++++++ src/builder/build_sparse_and_skew_index.cpp | 3 ++- 3 files changed, 26 insertions(+), 2 deletions(-) diff --git a/include/minimizers_control_map.hpp b/include/minimizers_control_map.hpp index 834d607..889917b 100644 --- a/include/minimizers_control_map.hpp +++ b/include/minimizers_control_map.hpp @@ -13,7 +13,8 @@ struct minimizers_control_map // mphf_build_config.alpha = 0.94; mphf_build_config.seed = util::get_seed_for_hash_function(build_config); mphf_build_config.verbose = false; - mphf_build_config.num_threads = build_config.num_threads; + mphf_build_config.num_threads = + util::cap_mphf_num_threads(build_config.num_threads, build_config.ram_limit_in_GiB); mphf_build_config.avg_partition_size = constants::avg_partition_size; mphf_build_config.ram = (build_config.ram_limit_in_GiB * essentials::GiB) / 2; mphf_build_config.tmp_dir = build_config.tmp_dirname; diff --git a/include/util.hpp b/include/util.hpp index bf9bebd..fc93013 100644 --- a/include/util.hpp +++ b/include/util.hpp @@ -199,6 +199,28 @@ static inline uint64_t get_seed_for_hash_function(build_configuration const& bui return build_config.seed != my_favourite_seed ? my_favourite_seed : ~my_favourite_seed; } +/* + Cap pthash's `num_threads` so its `partitioned_phf::build` parallelism + fits in the user's --ram-limit budget. + + pthash builds sub-partitions of the partitioned MPHF in parallel; each + sub-partition allocates a `pairs_t` vector of roughly + `avg_partition_size * sizeof(pair)` bytes during `map`/sort. With the + default `avg_partition_size = 3,000,000` and ~16 B/pair this is on + the order of ~48 MB per thread; conservatively budget 64 MB per + parallel sub-partition (covers the sort temporary + small constants). + + With `--ram-limit = G` GiB we allow pthash up to `G/4` GiB for this + parallel build memory, capping pthash threads accordingly. 
+*/ +static inline uint64_t cap_mphf_num_threads(uint64_t requested_num_threads, + uint64_t ram_limit_in_GiB) { + constexpr uint64_t per_thread_estimate_bytes = uint64_t(64) << 20; // 64 MiB + const uint64_t budget_bytes = (ram_limit_in_GiB * essentials::GiB) / 4; + const uint64_t max_parallel = std::max(1, budget_bytes / per_thread_estimate_bytes); + return std::min(requested_num_threads, max_parallel); +} + [[maybe_unused]] static bool ends_with(std::string const& str, std::string const& pattern) { if (pattern.size() > str.size()) return false; return std::equal(pattern.begin(), pattern.end(), str.end() - pattern.size()); diff --git a/src/builder/build_sparse_and_skew_index.cpp b/src/builder/build_sparse_and_skew_index.cpp index d4deba3..eca3498 100644 --- a/src/builder/build_sparse_and_skew_index.cpp +++ b/src/builder/build_sparse_and_skew_index.cpp @@ -602,7 +602,8 @@ void dictionary_builder::build_sparse_and_skew_index( mphf_build_config.alpha = 0.94; mphf_build_config.seed = util::get_seed_for_hash_function(build_config); mphf_build_config.verbose = false; - mphf_build_config.num_threads = build_config.num_threads; + mphf_build_config.num_threads = + util::cap_mphf_num_threads(build_config.num_threads, build_config.ram_limit_in_GiB); mphf_build_config.avg_partition_size = constants::avg_partition_size; mphf_build_config.ram = (build_config.ram_limit_in_GiB * essentials::GiB) / 2; mphf_build_config.tmp_dir = build_config.tmp_dirname; From e18b9b4146dd5f8c0f55466dd772fdfe4c34219d Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 5 May 2026 18:06:22 +0000 Subject: [PATCH 14/32] mphf thread cap: only kick in when budget would actually be exceeded MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous cap formula (ram_limit/4) was too aggressive — it fired even at default -g 8 -t 64 where capping isn't needed. 
That silently changed pthash's parallelism on configurations the user didn't intend to be tight, breaking expectations of "-t means t threads". Loosen the budget to ram_limit/2: leave roughly half of --ram-limit available to pthash's parallel sub-partition build memory (the other half covers the streaming buffers, external sort buffers, etc.). Cap pthash threads only when the user's -t would push pthash past that half-budget. Common cases now pass through unchanged: -g 8 -t 64 no cap (4 GiB / 64 MiB = 64 >= 64) -g 4 -t 32 no cap -g 2 -t 16 no cap -g 16 -t 128 no cap -g 2 -t 64 cap to 16 (pathological: tight budget vs many threads) -g 4 -t 64 cap to 32 When the cap fires, log a clear warning naming the MPHF, the requested thread count, the cap, and the budget. Verbose mode only. Verified byte-identical output and full --check matrix still pass. https://claude.ai/code/session_01BShS2GDASvEsCAbgJyQVBK --- include/minimizers_control_map.hpp | 5 ++- include/util.hpp | 43 ++++++++++++++------- src/builder/build_sparse_and_skew_index.cpp | 5 ++- 3 files changed, 34 insertions(+), 19 deletions(-) diff --git a/include/minimizers_control_map.hpp b/include/minimizers_control_map.hpp index 889917b..9db52d6 100644 --- a/include/minimizers_control_map.hpp +++ b/include/minimizers_control_map.hpp @@ -13,8 +13,9 @@ struct minimizers_control_map // mphf_build_config.alpha = 0.94; mphf_build_config.seed = util::get_seed_for_hash_function(build_config); mphf_build_config.verbose = false; - mphf_build_config.num_threads = - util::cap_mphf_num_threads(build_config.num_threads, build_config.ram_limit_in_GiB); + mphf_build_config.num_threads = util::cap_mphf_num_threads( + build_config.num_threads, build_config.ram_limit_in_GiB, build_config.verbose, + "minimizers MPHF"); mphf_build_config.avg_partition_size = constants::avg_partition_size; mphf_build_config.ram = (build_config.ram_limit_in_GiB * essentials::GiB) / 2; mphf_build_config.tmp_dir = build_config.tmp_dirname; diff --git 
a/include/util.hpp b/include/util.hpp index fc93013..7ab56dd 100644 --- a/include/util.hpp +++ b/include/util.hpp @@ -200,25 +200,38 @@ static inline uint64_t get_seed_for_hash_function(build_configuration const& bui } /* - Cap pthash's `num_threads` so its `partitioned_phf::build` parallelism - fits in the user's --ram-limit budget. - - pthash builds sub-partitions of the partitioned MPHF in parallel; each - sub-partition allocates a `pairs_t` vector of roughly - `avg_partition_size * sizeof(pair)` bytes during `map`/sort. With the - default `avg_partition_size = 3,000,000` and ~16 B/pair this is on - the order of ~48 MB per thread; conservatively budget 64 MB per - parallel sub-partition (covers the sort temporary + small constants). - - With `--ram-limit = G` GiB we allow pthash up to `G/4` GiB for this - parallel build memory, capping pthash threads accordingly. + Cap pthash's `num_threads` only when leaving it equal to the user's + `-t` would push the build past `--ram-limit`. + + pthash's `partitioned_phf::build` builds the partitioned MPHF's + sub-partitions in parallel; each sub-partition allocates a `pairs_t` + of roughly `avg_partition_size * sizeof(pair)` bytes during + `map`/sort. With the default `avg_partition_size = 3,000,000` this is + on the order of ~48 MB per thread; we conservatively budget 64 MiB + per parallel sub-partition (covers the sort temporary + slack). + + The other build steps (the streaming buffers, the per-step external + sort buffers, etc.) use up to roughly half of `--ram-limit`, so we + leave the other half available to pthash. Cap pthash threads so that + `64 MiB * threads <= ram_limit/2`. If the user's `-t` already fits, + we don't touch it: this only kicks in for pathologically tight + budgets (small `--ram-limit` combined with large `-t`). 
*/ static inline uint64_t cap_mphf_num_threads(uint64_t requested_num_threads, - uint64_t ram_limit_in_GiB) { + uint64_t ram_limit_in_GiB, + bool verbose, + char const* mphf_name) { constexpr uint64_t per_thread_estimate_bytes = uint64_t(64) << 20; // 64 MiB - const uint64_t budget_bytes = (ram_limit_in_GiB * essentials::GiB) / 4; + const uint64_t budget_bytes = (ram_limit_in_GiB * essentials::GiB) / 2; const uint64_t max_parallel = std::max(1, budget_bytes / per_thread_estimate_bytes); - return std::min(requested_num_threads, max_parallel); + if (requested_num_threads <= max_parallel) return requested_num_threads; + if (verbose) { + std::cout << " --> WARNING: capping pthread mphf threads for " << mphf_name + << " from " << requested_num_threads << " to " << max_parallel + << " to fit --ram-limit=" << ram_limit_in_GiB << " GiB" + << " (pthash uses ~64 MiB per parallel sub-partition build)" << std::endl; + } + return max_parallel; } [[maybe_unused]] static bool ends_with(std::string const& str, std::string const& pattern) { diff --git a/src/builder/build_sparse_and_skew_index.cpp b/src/builder/build_sparse_and_skew_index.cpp index eca3498..8e7e267 100644 --- a/src/builder/build_sparse_and_skew_index.cpp +++ b/src/builder/build_sparse_and_skew_index.cpp @@ -602,8 +602,9 @@ void dictionary_builder::build_sparse_and_skew_index( mphf_build_config.alpha = 0.94; mphf_build_config.seed = util::get_seed_for_hash_function(build_config); mphf_build_config.verbose = false; - mphf_build_config.num_threads = - util::cap_mphf_num_threads(build_config.num_threads, build_config.ram_limit_in_GiB); + mphf_build_config.num_threads = util::cap_mphf_num_threads( + build_config.num_threads, build_config.ram_limit_in_GiB, build_config.verbose, + "skew partition MPHF"); mphf_build_config.avg_partition_size = constants::avg_partition_size; mphf_build_config.ram = (build_config.ram_limit_in_GiB * essentials::GiB) / 2; mphf_build_config.tmp_dir = build_config.tmp_dirname; From 
869f901d5ba6952d0566ce5e9c05f251bf709884 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 5 May 2026 18:17:16 +0000 Subject: [PATCH 15/32] mphf: scale avg_partition_size to honor -t; cap only when pathological Previous approach quietly capped pthash's num_threads under tight --ram-limit, which broke the user's expectation that "-t N means N threads". The actual lever pthash exposes for per-thread memory is `avg_partition_size`: per-partition build memory is roughly `avg_partition_size * sizeof(pair)` bytes. Replace the unconditional thread cap with a configuration that divides the pthash RAM budget (half of --ram-limit) evenly across the user's requested threads, then derives an `avg_partition_size` for that per-thread budget: - per_thread_budget = (ram_limit / 2) / num_threads - avg = per_thread_budget / per_key_estimate (32 B) - if avg >= default (3M): use default; -t honored - elif avg >= floor (100K): use avg; -t honored - else: cap -t so floor fits, warn The warning only fires in the pathological case (so many threads at so little RAM that even pthash's quality floor can't fit). Common configurations pass through unchanged: -t 1 -g 8 no change (default partition 3M) -t 64 -g 8 -t honored, partition 2M -t 64 -g 4 -t honored, partition 1M -t 64 -g 2 -t honored, partition 524K -t 256 -g 1 warn + cap to 167 (-t couldn't be honored) Verified byte-identical output on -t 1 builds (the default single-thread case) and full --check matrix on regular, --canonical, multi-thread, and --weighted. 
https://claude.ai/code/session_01BShS2GDASvEsCAbgJyQVBK --- include/minimizers_control_map.hpp | 7 +- include/util.hpp | 83 ++++++++++++++------- src/builder/build_sparse_and_skew_index.cpp | 8 +- 3 files changed, 63 insertions(+), 35 deletions(-) diff --git a/include/minimizers_control_map.hpp b/include/minimizers_control_map.hpp index 9db52d6..5665cf9 100644 --- a/include/minimizers_control_map.hpp +++ b/include/minimizers_control_map.hpp @@ -13,10 +13,9 @@ struct minimizers_control_map // mphf_build_config.alpha = 0.94; mphf_build_config.seed = util::get_seed_for_hash_function(build_config); mphf_build_config.verbose = false; - mphf_build_config.num_threads = util::cap_mphf_num_threads( - build_config.num_threads, build_config.ram_limit_in_GiB, build_config.verbose, - "minimizers MPHF"); - mphf_build_config.avg_partition_size = constants::avg_partition_size; + util::configure_mphf_threads_and_partition(mphf_build_config, build_config.num_threads, + build_config.ram_limit_in_GiB, + build_config.verbose, "minimizers MPHF"); mphf_build_config.ram = (build_config.ram_limit_in_GiB * essentials::GiB) / 2; mphf_build_config.tmp_dir = build_config.tmp_dirname; diff --git a/include/util.hpp b/include/util.hpp index 7ab56dd..e45574f 100644 --- a/include/util.hpp +++ b/include/util.hpp @@ -200,38 +200,67 @@ static inline uint64_t get_seed_for_hash_function(build_configuration const& bui } /* - Cap pthash's `num_threads` only when leaving it equal to the user's - `-t` would push the build past `--ram-limit`. + Configure pthash's `num_threads` and `avg_partition_size` so that the + parallel sub-partition build memory fits in the user's --ram-limit + without unilaterally reducing the user's requested thread count. pthash's `partitioned_phf::build` builds the partitioned MPHF's sub-partitions in parallel; each sub-partition allocates a `pairs_t` - of roughly `avg_partition_size * sizeof(pair)` bytes during - `map`/sort. 
With the default `avg_partition_size = 3,000,000` this is - on the order of ~48 MB per thread; we conservatively budget 64 MiB - per parallel sub-partition (covers the sort temporary + slack). - - The other build steps (the streaming buffers, the per-step external - sort buffers, etc.) use up to roughly half of `--ram-limit`, so we - leave the other half available to pthash. Cap pthash threads so that - `64 MiB * threads <= ram_limit/2`. If the user's `-t` already fits, - we don't touch it: this only kicks in for pathologically tight - budgets (small `--ram-limit` combined with large `-t`). + of roughly `avg_partition_size * sizeof(pair)` bytes during the + `map` + sort step. So per-thread peak ≈ `avg_partition_size * + per_key_bytes`. We can scale `avg_partition_size` down to fit any + desired per-thread budget — the only floor is pthash's hash-search + quality, for which `avg_partition_size` should not go below ~100k. + + Strategy: split half of `--ram-limit` evenly across the requested + threads (the other half covers sshash's own buffers). For each + thread compute `per_thread_budget`, derive a candidate + `avg_partition_size`, and use it (clamped at the default upper end + so we never make partitions larger than usual). Only when the + derived `avg_partition_size` falls below the floor do we fall back + to capping threads — in that case we emit a warning naming the MPHF + so the user knows the requested -t couldn't be honored. 
*/ -static inline uint64_t cap_mphf_num_threads(uint64_t requested_num_threads, - uint64_t ram_limit_in_GiB, - bool verbose, - char const* mphf_name) { - constexpr uint64_t per_thread_estimate_bytes = uint64_t(64) << 20; // 64 MiB - const uint64_t budget_bytes = (ram_limit_in_GiB * essentials::GiB) / 2; - const uint64_t max_parallel = std::max(1, budget_bytes / per_thread_estimate_bytes); - if (requested_num_threads <= max_parallel) return requested_num_threads; - if (verbose) { - std::cout << " --> WARNING: capping pthread mphf threads for " << mphf_name - << " from " << requested_num_threads << " to " << max_parallel - << " to fit --ram-limit=" << ram_limit_in_GiB << " GiB" - << " (pthash uses ~64 MiB per parallel sub-partition build)" << std::endl; +static inline void configure_mphf_threads_and_partition( + pthash::build_configuration& mphf, // + uint64_t requested_num_threads, // + uint64_t ram_limit_in_GiB, // + bool verbose, // + char const* mphf_name) // +{ + constexpr uint64_t per_key_bytes = 32; // pairs_t entry + sort slack + constexpr uint64_t min_avg_partition_size = uint64_t(100) * 1000; + const uint64_t default_avg = constants::avg_partition_size; + + const uint64_t pthash_ram = (ram_limit_in_GiB * essentials::GiB) / 2; + const uint64_t per_thread = + pthash_ram / std::max(1, requested_num_threads); + const uint64_t avg_for_thread_budget = per_thread / per_key_bytes; + + if (avg_for_thread_budget >= default_avg) { + /* Plenty of RAM per thread — keep the default partition size. */ + mphf.num_threads = requested_num_threads; + mphf.avg_partition_size = default_avg; + } else if (avg_for_thread_budget >= min_avg_partition_size) { + /* Tighter per-thread budget: shrink partitions to fit; threads + honored. */ + mphf.num_threads = requested_num_threads; + mphf.avg_partition_size = avg_for_thread_budget; + } else { + /* Pathological: not enough RAM per thread even at the floor. + Cap threads so the floor fits. 
*/ + const uint64_t max_threads = std::max( + 1, pthash_ram / (per_key_bytes * min_avg_partition_size)); + if (verbose) { + std::cout << " --> WARNING: not enough RAM per thread for " << mphf_name + << " (--ram-limit=" << ram_limit_in_GiB << " GiB, " + << requested_num_threads << " requested threads): capping to " + << max_threads << " threads at min partition size " + << min_avg_partition_size << std::endl; + } + mphf.num_threads = max_threads; + mphf.avg_partition_size = min_avg_partition_size; } - return max_parallel; } [[maybe_unused]] static bool ends_with(std::string const& str, std::string const& pattern) { diff --git a/src/builder/build_sparse_and_skew_index.cpp b/src/builder/build_sparse_and_skew_index.cpp index 8e7e267..aa8ea08 100644 --- a/src/builder/build_sparse_and_skew_index.cpp +++ b/src/builder/build_sparse_and_skew_index.cpp @@ -602,10 +602,10 @@ void dictionary_builder::build_sparse_and_skew_index( mphf_build_config.alpha = 0.94; mphf_build_config.seed = util::get_seed_for_hash_function(build_config); mphf_build_config.verbose = false; - mphf_build_config.num_threads = util::cap_mphf_num_threads( - build_config.num_threads, build_config.ram_limit_in_GiB, build_config.verbose, - "skew partition MPHF"); - mphf_build_config.avg_partition_size = constants::avg_partition_size; + util::configure_mphf_threads_and_partition(mphf_build_config, build_config.num_threads, + build_config.ram_limit_in_GiB, + build_config.verbose, + "skew partition MPHF"); mphf_build_config.ram = (build_config.ram_limit_in_GiB * essentials::GiB) / 2; mphf_build_config.tmp_dir = build_config.tmp_dirname; From 8e4b0d8e3c543c1e9d70db82c10a224f30902913 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 5 May 2026 18:41:40 +0000 Subject: [PATCH 16/32] clamp --ram-limit to a 4 GiB floor MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Below ~4 GiB the streaming buffers + pthash's internal working memory (which we don't fully control) can't 
practically be made to fit; squeezing further has diminishing returns. Rather than degrade the build into ever-tinier buffers and ever-smaller pthash partition sizes, treat 4 GiB as the effective floor — a modest requirement on today's desktops. Add `constants::min_ram_limit_in_GiB = 4` and apply it in the single validation/normalization step at the build entrypoint (both `dictionary::build` and `dictionary::build_streaming_save`). A NOTE is printed in verbose mode whenever the user's `-g` is raised. Configurations at or above the floor are unaffected. Verified byte-identical output on the default build, full --check on regular, --canonical, multi-thread, and --weighted. https://claude.ai/code/session_01BShS2GDASvEsCAbgJyQVBK --- include/constants.hpp | 5 +++++ src/builder/build.cpp | 27 +++++++++++++++++++++------ 2 files changed, 26 insertions(+), 6 deletions(-) diff --git a/include/constants.hpp b/include/constants.hpp index 80f20fa..a020a6a 100644 --- a/include/constants.hpp +++ b/include/constants.hpp @@ -4,6 +4,11 @@ namespace sshash::constants { constexpr uint64_t invalid_uint64 = uint64_t(-1); constexpr uint64_t default_ram_limit_in_GiB = 8; +/* Floor on --ram-limit. Below this the build's streaming buffers + pthash's + internal working memory can't usefully be made to fit; rather than degrade + further at very tight budgets, we clamp `-g` to at least this value + (modest by today's desktop standards). 
*/ +constexpr uint64_t min_ram_limit_in_GiB = 4; constexpr uint64_t seed = 1; /* for PTHash */ diff --git a/src/builder/build.cpp b/src/builder/build.cpp index 76d0b97..f1354d9 100644 --- a/src/builder/build.cpp +++ b/src/builder/build.cpp @@ -8,8 +8,8 @@ namespace sshash { namespace { -inline void validate_build_config_or_throw(build_configuration const& bc, uint64_t max_k, - uint64_t max_m) { +inline void validate_and_normalize_build_config(build_configuration& bc, uint64_t max_k, + uint64_t max_m) { if (bc.k == 0) throw std::runtime_error("k must be > 0"); if (bc.k > max_k) { throw std::runtime_error("k must be less <= " + std::to_string(max_k) + @@ -21,6 +21,19 @@ inline void validate_build_config_or_throw(build_configuration const& bc, uint64 " but got m = " + std::to_string(bc.m)); } if (bc.m > bc.k) throw std::runtime_error("m must be <= k"); + + /* Clamp --ram-limit to the floor. Below this, the streaming buffers + plus pthash's internal working memory can't usefully be made to + fit; rather than try to squeeze further we treat the floor as the + effective budget. 
*/ + if (bc.ram_limit_in_GiB < constants::min_ram_limit_in_GiB) { + if (bc.verbose) { + std::cout << " --> NOTE: --ram-limit raised from " << bc.ram_limit_in_GiB + << " GiB to the floor of " << constants::min_ram_limit_in_GiB << " GiB" + << std::endl; + } + bc.ram_limit_in_GiB = constants::min_ram_limit_in_GiB; + } } } // namespace @@ -29,8 +42,9 @@ template void dictionary::build(std::string const& filename, build_configuration const& build_config) // { - validate_build_config_or_throw(build_config, Kmer::max_k, Kmer::max_m); - dictionary_builder builder(build_config); + build_configuration bc = build_config; + validate_and_normalize_build_config(bc, Kmer::max_k, Kmer::max_m); + dictionary_builder builder(bc); builder.build(*this, filename); } @@ -39,8 +53,9 @@ void dictionary::build_streaming_save( std::string const& input_filename, build_configuration const& build_config, std::string const& output_filename) // { - validate_build_config_or_throw(build_config, Kmer::max_k, Kmer::max_m); - dictionary_builder builder(build_config); + build_configuration bc = build_config; + validate_and_normalize_build_config(bc, Kmer::max_k, Kmer::max_m); + dictionary_builder builder(bc); builder.build_streaming_save(*this, input_filename, output_filename); } From f68fa779d63bedb4d8a2e3cf49d12bc372834b7d Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 6 May 2026 08:23:16 +0000 Subject: [PATCH 17/32] bump pthash to claude/fix-pthash-memory-estimate-NhgTI Picks up two commits on the pthash branch: 90ace87 Account for input hash vector in construction memory estimate 9600827 Drop the redundant in-search hashes term and the comment The fix corrects pthash's `estimate_num_bytes_for_construction`, which underestimated per-partition residency by `num_keys * sizeof(hash_type)` (= 16 B/key for hash128) and by an extra `num_keys * 8 B` double-counted "in-search hashes" term. 
With the corrected estimate, pthash's `bytes < config.ram` flush gate in the parallel partitioned-PHF build path actually matches residency, so `mphf_build_config.ram = ram_limit/2` will now bind pthash's parallel build batch to that budget on inputs like HPRC k=63 m=31 canonical (where the previous underestimate had pthash's batch peak around 1.5x the configured budget). Verified byte-identical SSHash output before/after the bump on salmonella_enterica m=7 (the change is purely a memory-accounting fix). Full --check matrix passes. When pthash master receives this fix we'll bump again to that tip; for now we point at the branch tip directly so the HPRC benchmark can validate the new RSS bound. https://claude.ai/code/session_01BShS2GDASvEsCAbgJyQVBK --- external/pthash | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/external/pthash b/external/pthash index e04a192..9600827 160000 --- a/external/pthash +++ b/external/pthash @@ -1 +1 @@ -Subproject commit e04a1920ffeae9e7d876acd0362cab79605f7af3 +Subproject commit 960082760bdf2e7315c0b827e747acb84a2c7c99 From c550d532069948bd8f0b1b3a699b8cb8bfe37bf2 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 6 May 2026 08:44:01 +0000 Subject: [PATCH 18/32] tighten ram-proportional buffer caps from ram/4 to ram/8 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After the pthash memory-estimate fix, pthash's parallel-build batch correctly binds at `ram_limit/2`. The remaining excess on the HPRC k=63 m=31 -g 4 -t 64 benchmark was sshash's own buffers (each currently sized at ram_limit/4) piled on top: - step 5 minimizer-tuples buffer: up to RAM_available/3 (~1.1 GB at -g 4) — and freed but typically retained by glibc, so it lingers in process RSS through subsequent steps. - step 7.1 kmer-extraction request buffer: ram_limit/4 (1 GiB at -g 4). 
- step 7.2 phase C position-tuple buffer: ram_limit/4 (1 GiB at -g 4) — alive concurrently with pthash's parallel-build memory and the partition's MPHF. Halve them all (cap at ram_limit/8 = 512 MiB at -g 4). The external sorts get more flush rounds and slightly more disk I/O, but peak RSS during the heaviest step (7.2 phase C) drops by roughly: pthash 2 GiB + pos buffer 0.5 GiB + partition MPHF 0.3 GB + step-5 lingering 0.5 GiB ≈ 3.3 GiB vs the prior ~4.8 GiB observed peak. Should fit comfortably under -g 4. Verified byte-identical SSHash output on salmonella_enterica m=7 (more flush rounds = more intermediate run files but the sort+merge is order-stable for the bytes we care about). https://claude.ai/code/session_01BShS2GDASvEsCAbgJyQVBK --- include/builder/dictionary_builder.hpp | 16 +++++++++++++--- src/builder/build_sparse_and_skew_index.cpp | 11 ++++++++--- 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/include/builder/dictionary_builder.hpp b/include/builder/dictionary_builder.hpp index 58a3ab1..b3e2b34 100644 --- a/include/builder/dictionary_builder.hpp +++ b/include/builder/dictionary_builder.hpp @@ -347,9 +347,19 @@ struct dictionary_builder // } const uint64_t num_super_kmers = minimizers.num_super_kmers(); - const uint64_t buffer_size = num_files_to_merge == 1 - ? num_super_kmers - : (RAM_available_in_bytes / (3 * sizeof(minimizer_tuple))); + /* Cap the in-RAM buffer at ram_limit/8 worth of tuples so that + even when subsequent steps fragment the heap, step 5's lingering + pages don't blow past the budget when stacked with later step's + allocations. */ + const uint64_t buffer_cap_bytes = + (build_config.ram_limit_in_GiB * essentials::GiB) / 8; + const uint64_t buffer_cap_records = + std::max(uint64_t(1) << 16, buffer_cap_bytes / sizeof(minimizer_tuple)); + const uint64_t buffer_size_unbounded = + num_files_to_merge == 1 + ? 
num_super_kmers + : (RAM_available_in_bytes / (3 * sizeof(minimizer_tuple))); + const uint64_t buffer_size = std::min(buffer_size_unbounded, buffer_cap_records); const uint64_t num_blocks = (num_super_kmers + buffer_size - 1) / buffer_size; assert(num_super_kmers > (num_blocks - 1) * buffer_size); diff --git a/src/builder/build_sparse_and_skew_index.cpp b/src/builder/build_sparse_and_skew_index.cpp index aa8ea08..30ecdd5 100644 --- a/src/builder/build_sparse_and_skew_index.cpp +++ b/src/builder/build_sparse_and_skew_index.cpp @@ -307,12 +307,14 @@ void dictionary_builder::build_sparse_and_skew_index( }; /* External-sort buffer for kmer-extraction requests (formerly step 7.2 - phase A; now folded into the combined pass). */ + phase A; now folded into the combined pass). Capped at ram_limit/8 + so heap fragmentation across steps doesn't push peak RSS past the + --ram-limit budget. */ std::atomic num_request_runs{0}; const uint64_t request_buffer_capacity = std::max( uint64_t(1) << 16, (build_config.ram_limit_in_GiB * essentials::GiB) / - (4 * sizeof(kmer_extraction_request))); + (8 * sizeof(kmer_extraction_request))); std::vector request_buffer; request_buffer.reserve(request_buffer_capacity); auto flush_request_buffer = [&]() { @@ -629,9 +631,12 @@ void dictionary_builder::build_sparse_and_skew_index( return ss.str(); }; + /* Capped at ram_limit/8: this buffer is alive during phase C + alongside pthash's parallel-build memory and the currently- + building partition's MPHF, so it has to share the RAM budget. 
*/ const uint64_t pos_buffer_capacity = std::max( uint64_t(1) << 16, - (build_config.ram_limit_in_GiB * essentials::GiB) / (4 * sizeof(position_tuple))); + (build_config.ram_limit_in_GiB * essentials::GiB) / (8 * sizeof(position_tuple))); uint64_t lower = min_size; uint64_t upper = 2 * lower; From 365758b7871b9ea69ddc6d886eae04d8b14fa074 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 6 May 2026 09:19:00 +0000 Subject: [PATCH 19/32] bump pthash submodule to master tip a95e814 PR #97 ("fix pthash memory estimate") landed in pthash master. This bump moves us from the (now-merged) branch tip 9600827 to the master squash-merge commit a95e814. The substantive content of `internal_memory_builder_single_phf.hpp` is byte-identical between the two, so SSHash's behavior is unchanged from f68fa77. This unpins us from the development branch URL and lets us track pthash master going forward. https://claude.ai/code/session_01BShS2GDASvEsCAbgJyQVBK --- external/pthash | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/external/pthash b/external/pthash index 9600827..a95e814 160000 --- a/external/pthash +++ b/external/pthash @@ -1 +1 @@ -Subproject commit 960082760bdf2e7315c0b827e747acb84a2c7c99 +Subproject commit a95e8147a8ba1fa33b57fa24de7b5e674423e9a7 From 526b64bb6f37d4629ad88c16db0d4d0547187ee4 Mon Sep 17 00:00:00 2001 From: jermp Date: Wed, 6 May 2026 16:17:04 +0200 Subject: [PATCH 20/32] clang format --- include/builder/dictionary_builder.hpp | 25 +++++----- .../builder/disk_backed_offsets_builder.hpp | 7 +-- include/builder/disk_backed_strings.hpp | 23 +++++---- .../streaming_compact_vector_writer.hpp | 10 ++-- include/builder/streaming_save.hpp | 19 ++++---- src/builder/build.cpp | 6 +-- src/builder/build_sparse_and_skew_index.cpp | 47 ++++++++----------- 7 files changed, 60 insertions(+), 77 deletions(-) diff --git a/include/builder/dictionary_builder.hpp b/include/builder/dictionary_builder.hpp index b3e2b34..4cf8d0c 100644 --- 
a/include/builder/dictionary_builder.hpp +++ b/include/builder/dictionary_builder.hpp @@ -93,9 +93,9 @@ struct dictionary_builder // caller only needs the on-disk index file and wants to keep peak RAM bounded by the build phase. */ - void build_streaming_save(dictionary& d, // - std::string const& filename, // - std::string const& output_filename) // + void build_streaming_save(dictionary& d, // + std::string const& filename, // + std::string const& output_filename) // { run_steps_1_through_7(d, filename); do_step("step 8 (stream-save dictionary to disk)", [&]() { @@ -122,8 +122,8 @@ struct dictionary_builder // /* Skew positions / mphfs: populate the owning_spans with placeholders so the visit walks the right number of entries and we can take their addresses for substitution. */ - const std::size_t num_part = std::max(spilled.skew_positions_paths.size(), - spilled.skew_mphfs_paths.size()); + const std::size_t num_part = + std::max(spilled.skew_positions_paths.size(), spilled.skew_mphfs_paths.size()); if (num_part > 0) { std::vector position_placeholders(num_part); std::vector> mphf_placeholders(num_part); @@ -137,8 +137,7 @@ struct dictionary_builder // } for (std::size_t i = 0; i != spilled.skew_mphfs_paths.size(); ++i) { if (!spilled.skew_mphfs_paths[i].empty()) { - register_sub(subs, &d.m_ssi.ski.mphfs[i], - spilled.skew_mphfs_paths[i]); + register_sub(subs, &d.m_ssi.ski.mphfs[i], spilled.skew_mphfs_paths[i]); } } } @@ -186,8 +185,8 @@ struct dictionary_builder // essentials::loader loader(spilled.codewords_mphf_path.c_str()); loader.visit(d.m_ssi.codewords.mphf); } - const std::size_t num_part = std::max(spilled.skew_positions_paths.size(), - spilled.skew_mphfs_paths.size()); + const std::size_t num_part = + std::max(spilled.skew_positions_paths.size(), spilled.skew_mphfs_paths.size()); if (num_part > 0) { std::vector positions_vec(num_part); std::vector> mphfs_vec(num_part); @@ -351,14 +350,12 @@ struct dictionary_builder // even when subsequent steps 
fragment the heap, step 5's lingering pages don't blow past the budget when stacked with later step's allocations. */ - const uint64_t buffer_cap_bytes = - (build_config.ram_limit_in_GiB * essentials::GiB) / 8; + const uint64_t buffer_cap_bytes = (build_config.ram_limit_in_GiB * essentials::GiB) / 8; const uint64_t buffer_cap_records = std::max(uint64_t(1) << 16, buffer_cap_bytes / sizeof(minimizer_tuple)); const uint64_t buffer_size_unbounded = - num_files_to_merge == 1 - ? num_super_kmers - : (RAM_available_in_bytes / (3 * sizeof(minimizer_tuple))); + num_files_to_merge == 1 ? num_super_kmers + : (RAM_available_in_bytes / (3 * sizeof(minimizer_tuple))); const uint64_t buffer_size = std::min(buffer_size_unbounded, buffer_cap_records); const uint64_t num_blocks = (num_super_kmers + buffer_size - 1) / buffer_size; assert(num_super_kmers > (num_blocks - 1) * buffer_size); diff --git a/include/builder/disk_backed_offsets_builder.hpp b/include/builder/disk_backed_offsets_builder.hpp index c86d33f..6be3562 100644 --- a/include/builder/disk_backed_offsets_builder.hpp +++ b/include/builder/disk_backed_offsets_builder.hpp @@ -96,9 +96,7 @@ struct disk_backed_offsets_builder { std::string const& filename() const { return m_filename; } /* In-RAM footprint of the builder (excluding the on-disk file). 
*/ - uint64_t num_bytes() const { - return sizeof(m_nb) + m_buf.capacity() * sizeof(uint64_t); - } + uint64_t num_bytes() const { return sizeof(m_nb) + m_buf.capacity() * sizeof(uint64_t); } void set_num_bits(num_bits nb) { m_nb = nb; } @@ -151,8 +149,7 @@ struct disk_backed_offsets_builder { if (!m_in.is_open()) { throw std::runtime_error("cannot open offsets tmp file '" + filename + "'"); } - m_in.seekg(static_cast(start_index * sizeof(uint64_t)), - std::ios::beg); + m_in.seekg(static_cast(start_index * sizeof(uint64_t)), std::ios::beg); refill(); } diff --git a/include/builder/disk_backed_strings.hpp b/include/builder/disk_backed_strings.hpp index dabafe9..d267d4a 100644 --- a/include/builder/disk_backed_strings.hpp +++ b/include/builder/disk_backed_strings.hpp @@ -153,17 +153,17 @@ struct disk_backed_strings { const uint64_t block = bit_pos >> 6; const uint64_t shift = bit_pos & 63; ensure_window_covers(block); - uint64_t a = (block >= m_window_start_word && - block < m_window_start_word + m_window_size) - ? m_window[block - m_window_start_word] - : uint64_t(0); + uint64_t a = + (block >= m_window_start_word && block < m_window_start_word + m_window_size) + ? m_window[block - m_window_start_word] + : uint64_t(0); uint64_t word = a >> shift; if (shift) { const uint64_t next = block + 1; - uint64_t b = (next >= m_window_start_word && - next < m_window_start_word + m_window_size) - ? m_window[next - m_window_start_word] - : uint64_t(0); + uint64_t b = + (next >= m_window_start_word && next < m_window_start_word + m_window_size) + ? 
m_window[next - m_window_start_word] + : uint64_t(0); word |= b << (64 - shift); } return word; @@ -185,8 +185,7 @@ struct disk_backed_strings { return; } m_in.clear(); // clear any prior eof - m_in.seekg(static_cast(target_word * sizeof(uint64_t)), - std::ios::beg); + m_in.seekg(static_cast(target_word * sizeof(uint64_t)), std::ios::beg); const uint64_t to_read = std::min(m_window_capacity, m_total_words - target_word); m_in.read(reinterpret_cast(m_window.data()), static_cast(to_read * sizeof(uint64_t))); @@ -264,8 +263,8 @@ struct disk_backed_strings { std::vector buffer(uint64_t(64) << 10); // 64 KiB uint64_t bytes_remaining = total_words * sizeof(uint64_t); while (bytes_remaining > 0) { - const std::streamsize chunk = static_cast( - std::min(buffer.size(), bytes_remaining)); + const std::streamsize chunk = + static_cast(std::min(buffer.size(), bytes_remaining)); in.read(buffer.data(), chunk); const std::streamsize got = in.gcount(); if (got <= 0) { diff --git a/include/builder/streaming_compact_vector_writer.hpp b/include/builder/streaming_compact_vector_writer.hpp index ea9dce1..e36e83b 100644 --- a/include/builder/streaming_compact_vector_writer.hpp +++ b/include/builder/streaming_compact_vector_writer.hpp @@ -32,8 +32,10 @@ struct streaming_compact_vector_writer { streaming_compact_vector_writer& operator=(streaming_compact_vector_writer const&) = delete; void open(std::string const& filename, uint64_t num_entries, uint64_t width) { - if (width == 0) throw std::runtime_error("streaming_compact_vector_writer: width must be > 0"); - if (width > 64) throw std::runtime_error("streaming_compact_vector_writer: width must be <= 64"); + if (width == 0) + throw std::runtime_error("streaming_compact_vector_writer: width must be > 0"); + if (width > 64) + throw std::runtime_error("streaming_compact_vector_writer: width must be <= 64"); m_filename = filename; m_num_entries = num_entries; m_width = width; @@ -66,9 +68,7 @@ struct streaming_compact_vector_writer { /* 
Write a value at position `index`. Successive calls must satisfy `index >= previous_index`; gaps are filled with zero. */ void set(uint64_t index, uint64_t value) { - if (m_have_last_index) { - assert(index >= m_last_index); - } + if (m_have_last_index) { assert(index >= m_last_index); } m_have_last_index = true; m_last_index = index; diff --git a/include/builder/streaming_save.hpp b/include/builder/streaming_save.hpp index 7ccc87c..78db027 100644 --- a/include/builder/streaming_save.hpp +++ b/include/builder/streaming_save.hpp @@ -54,9 +54,9 @@ struct typed_address_sub { recursion via val.visit(*this)). */ struct streaming_strings_saver { - streaming_strings_saver(std::ostream& os, // - bits::bit_vector const* strings_addr, // - disk_backed_strings const* strings_storage, // + streaming_strings_saver(std::ostream& os, // + bits::bit_vector const* strings_addr, // + disk_backed_strings const* strings_storage, // std::unordered_map address_subs) : m_os(os) , m_strings_addr(strings_addr) @@ -149,9 +149,9 @@ struct streaming_strings_saver { standard essentials path. */ template -void save_streaming(T const& t, char const* filename, // - bits::bit_vector const* strings_addr, // - disk_backed_strings const& strings_storage, // +void save_streaming(T const& t, char const* filename, // + bits::bit_vector const* strings_addr, // + disk_backed_strings const& strings_storage, // std::unordered_map address_subs = {}) { std::ofstream out(filename, std::ios::binary); if (!out.good()) { @@ -164,11 +164,10 @@ void save_streaming(T const& t, char const* filename, /* Helper: register a typed substitution at the address of `addr`. 
*/ template -inline void register_sub(std::unordered_map& subs, - T const* addr, std::string filename) { +inline void register_sub(std::unordered_map& subs, T const* addr, + std::string filename) { subs.insert_or_assign(static_cast(addr), - typed_address_sub{std::move(filename), - std::type_index(typeid(T))}); + typed_address_sub{std::move(filename), std::type_index(typeid(T))}); } } // namespace sshash diff --git a/src/builder/build.cpp b/src/builder/build.cpp index f1354d9..0f6210b 100644 --- a/src/builder/build.cpp +++ b/src/builder/build.cpp @@ -49,9 +49,9 @@ void dictionary::build(std::string const& filename, } template -void dictionary::build_streaming_save( - std::string const& input_filename, build_configuration const& build_config, - std::string const& output_filename) // +void dictionary::build_streaming_save(std::string const& input_filename, + build_configuration const& build_config, + std::string const& output_filename) // { build_configuration bc = build_config; validate_and_normalize_build_config(bc, Kmer::max_k, Kmer::max_m); diff --git a/src/builder/build_sparse_and_skew_index.cpp b/src/builder/build_sparse_and_skew_index.cpp index 30ecdd5..a861093 100644 --- a/src/builder/build_sparse_and_skew_index.cpp +++ b/src/builder/build_sparse_and_skew_index.cpp @@ -14,19 +14,15 @@ namespace sshash { #pragma pack(push, 4) struct kmer_extraction_request { kmer_extraction_request() {} - kmer_extraction_request(uint64_t starting_pos, uint32_t partition_id, - uint32_t pos_in_bucket, uint32_t num_kmers_in_super_kmer) + kmer_extraction_request(uint64_t starting_pos, uint32_t partition_id, uint32_t pos_in_bucket, + uint32_t num_kmers_in_super_kmer) : starting_pos(starting_pos) , partition_id(partition_id) , pos_in_bucket(pos_in_bucket) , num_kmers_in_super_kmer(num_kmers_in_super_kmer) {} - bool operator<(kmer_extraction_request const& o) const { - return starting_pos < o.starting_pos; - } - bool operator>(kmer_extraction_request const& o) const { - return 
starting_pos > o.starting_pos; - } + bool operator<(kmer_extraction_request const& o) const { return starting_pos < o.starting_pos; } + bool operator>(kmer_extraction_request const& o) const { return starting_pos > o.starting_pos; } static kmer_extraction_request max() { return kmer_extraction_request(uint64_t(-1), uint32_t(-1), uint32_t(-1), uint32_t(-1)); @@ -232,8 +228,7 @@ void dictionary_builder::build_sparse_and_skew_index( std::vector begin_buckets_of_size(min_size + 1, 0); for (uint64_t s = 3; s <= min_size; ++s) { begin_buckets_of_size[s] = static_cast( // - begin_buckets_of_size[s - 1] + - buckets_stats.num_buckets_of_size(s - 1) * (s - 1)); + begin_buckets_of_size[s - 1] + buckets_stats.num_buckets_of_size(s - 1) * (s - 1)); } d.m_ssi.begin_buckets_of_size = std::move(begin_buckets_of_size); @@ -270,8 +265,8 @@ void dictionary_builder::build_sparse_and_skew_index( compact_vector writer to assemble the final mid_load_buckets file. */ auto mid_load_per_size_path = [&](uint64_t s) { std::stringstream ss; - ss << build_config.tmp_dirname << "/sshash.tmp.run_" << step7_run_id - << ".mid_load_size_" << s << ".bin"; + ss << build_config.tmp_dirname << "/sshash.tmp.run_" << step7_run_id << ".mid_load_size_" + << s << ".bin"; return ss.str(); }; std::vector mid_load_per_size(min_size + 1); @@ -295,14 +290,14 @@ void dictionary_builder::build_sparse_and_skew_index( const uint64_t skew_run_id = pthash::clock_type::now().time_since_epoch().count(); auto request_run_filename = [&](uint64_t id) { std::stringstream ss; - ss << build_config.tmp_dirname << "/sshash.tmp.run_" << skew_run_id - << ".kmer_requests." << id << ".bin"; + ss << build_config.tmp_dirname << "/sshash.tmp.run_" << skew_run_id << ".kmer_requests." + << id << ".bin"; return ss.str(); }; auto skew_partition_filename = [&](uint64_t pid) { std::stringstream ss; - ss << build_config.tmp_dirname << "/sshash.tmp.run_" << skew_run_id - << ".skew_kmers." 
<< pid << ".bin"; + ss << build_config.tmp_dirname << "/sshash.tmp.run_" << skew_run_id << ".skew_kmers." << pid + << ".bin"; return ss.str(); }; @@ -311,10 +306,9 @@ void dictionary_builder::build_sparse_and_skew_index( so heap fragmentation across steps doesn't push peak RSS past the --ram-limit budget. */ std::atomic num_request_runs{0}; - const uint64_t request_buffer_capacity = std::max( - uint64_t(1) << 16, - (build_config.ram_limit_in_GiB * essentials::GiB) / - (8 * sizeof(kmer_extraction_request))); + const uint64_t request_buffer_capacity = + std::max(uint64_t(1) << 16, (build_config.ram_limit_in_GiB * essentials::GiB) / + (8 * sizeof(kmer_extraction_request))); std::vector request_buffer; request_buffer.reserve(request_buffer_capacity); auto flush_request_buffer = [&]() { @@ -424,8 +418,7 @@ void dictionary_builder::build_sparse_and_skew_index( d.m_spss.strings_offsets.decode(mt.pos_in_seq).absolute_offset; const uint64_t starting_pos = abs_offset - mt.pos_in_kmer; if (request_buffer.size() == request_buffer_capacity) flush_request_buffer(); - request_buffer.emplace_back(starting_pos, uint32_t(partition_id), - pos_in_bucket, + request_buffer.emplace_back(starting_pos, uint32_t(partition_id), pos_in_bucket, uint32_t(mt.num_kmers_in_super_kmer)); } } @@ -452,8 +445,7 @@ void dictionary_builder::build_sparse_and_skew_index( { streaming_compact_vector_writer mid_load_writer; mid_load_writer.open(spilled.mid_load_buckets_path, - num_minimizer_positions_of_buckets_larger_than_1, - num_bits_per_offset); + num_minimizer_positions_of_buckets_larger_than_1, num_bits_per_offset); uint64_t global_index = 0; for (uint64_t s = 2; s <= min_size; ++s) { const uint64_t expected = buckets_stats.num_buckets_of_size(s) * s; @@ -518,8 +510,8 @@ void dictionary_builder::build_sparse_and_skew_index( std::string operator*() const { std::stringstream ss; - ss << tmp_dirname << "/sshash.tmp.run_" << skew_run_id - << ".kmer_requests." 
<< i << ".bin"; + ss << tmp_dirname << "/sshash.tmp.run_" << skew_run_id << ".kmer_requests." << i + << ".bin"; return ss.str(); } void operator++() { ++i; } @@ -606,8 +598,7 @@ void dictionary_builder::build_sparse_and_skew_index( mphf_build_config.verbose = false; util::configure_mphf_threads_and_partition(mphf_build_config, build_config.num_threads, build_config.ram_limit_in_GiB, - build_config.verbose, - "skew partition MPHF"); + build_config.verbose, "skew partition MPHF"); mphf_build_config.ram = (build_config.ram_limit_in_GiB * essentials::GiB) / 2; mphf_build_config.tmp_dir = build_config.tmp_dirname; From b3e49c90746b954892cf8719bbd51a4a44e483a4 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 6 May 2026 14:40:45 +0000 Subject: [PATCH 21/32] factor out buffered_record_stream; remove duplicated read loops The build's various ifstream-backed forward readers all followed the same pattern (open file, fixed-capacity record buffer, refill on exhaust, has_next/current/advance interface). Five separate copies of that loop existed: - file_merging_iterator::buffered_stream (per merge run) - disk_backed_offsets_builder::reader (per-thread offsets) - disk_backed_offsets_builder::full_iterator (encode/build) - streaming_minimizers_iterator (codewords MPHF input) - streaming_minimizer_bucket_reader (step 7.1 buckets) - skew_partition_kmer_iterator (skew MPHF input) Hoist the common primitive into a new `include/builder/buffered_record_stream.hpp`: template struct buffered_record_stream { void open(filename, buffer_records, start_byte = 0); void close(); bool empty() const; T const& current() const; void advance(); }; and reuse it from each of the five readers. The pthash-iterator ones (full_iterator, streaming_minimizers_iterator, skew_partition_kmer_iterator) wrap a shared_ptr to keep the copyable-by-value contract pthash expects. 
While here, define a packed `skew_kmer_record_t` for the phase B/C tmp file format and use it both at the writer and the buffered reader, so the on-disk record layout is in one place. Verified byte-identical SSHash output on salmonella_enterica m=7 and full --check on regular, --canonical, multi-thread (-t 4), and --weighted. https://claude.ai/code/session_01BShS2GDASvEsCAbgJyQVBK --- include/builder/buffered_record_stream.hpp | 106 ++++++++++++++++++ .../builder/disk_backed_offsets_builder.hpp | 92 ++++----------- include/builder/file_merging_iterator.hpp | 68 +---------- include/builder/util.hpp | 83 +++++--------- src/builder/build_sparse_and_skew_index.cpp | 81 ++++++------- 5 files changed, 202 insertions(+), 228 deletions(-) create mode 100644 include/builder/buffered_record_stream.hpp diff --git a/include/builder/buffered_record_stream.hpp b/include/builder/buffered_record_stream.hpp new file mode 100644 index 0000000..382505c --- /dev/null +++ b/include/builder/buffered_record_stream.hpp @@ -0,0 +1,106 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +namespace sshash { + +/* + A small buffered, forward-only reader of fixed-size records over a + binary file. Records are read in fixed-capacity chunks (~`buffer_records + * sizeof(T)` bytes of RAM) so the per-instance footprint is bounded + independently of the file size. + + Used as the underlying primitive by all of SSHash's builder readers + over on-disk record files (minimizer tuples, kmer requests, kmer + records, offset values, sorted-run records, etc.). The class is + move-only; for callers that need a copyable forward iterator (e.g. + pthash's `build_in_external_memory`, which takes an iterator by + value), wrap an instance in a `std::shared_ptr`. 
+ + Usage: + buffered_record_stream s; + s.open(filename); + while (!s.empty()) { + consume(s.current()); + s.advance(); + } + s.close(); +*/ +template +struct buffered_record_stream { + static constexpr uint64_t default_buffer_records = 4096; + + buffered_record_stream() = default; + buffered_record_stream(buffered_record_stream const&) = delete; + buffered_record_stream& operator=(buffered_record_stream const&) = delete; + buffered_record_stream(buffered_record_stream&&) = default; + buffered_record_stream& operator=(buffered_record_stream&&) = default; + + /* Open `filename` for forward reading; optionally seek to byte + `start_byte` before priming the read window. */ + void open(std::string const& filename, + uint64_t buffer_records = default_buffer_records, + std::streamoff start_byte = 0) { + m_buf.resize(std::max(1, buffer_records)); + m_in.open(filename, std::ifstream::binary); + if (!m_in.is_open()) { + throw std::runtime_error("cannot open file '" + filename + "'"); + } + if (start_byte != 0) m_in.seekg(start_byte, std::ios::beg); + m_pos = 0; + m_size = 0; + m_eof = false; + refill(); + } + + void close() { + if (m_in.is_open()) m_in.close(); + m_buf.clear(); + m_buf.shrink_to_fit(); + m_pos = 0; + m_size = 0; + m_eof = true; + } + + bool is_open() const { return m_in.is_open(); } + + /* True iff there are no more records in the stream. */ + bool empty() const { return m_pos >= m_size; } + + /* Reference to the current record. Valid until the next `advance()`. */ + T const& current() const { + assert(!empty()); + return m_buf[m_pos]; + } + + /* Move to the next record; refills the buffer from disk on demand. 
*/ + void advance() { + assert(!empty()); + ++m_pos; + if (m_pos >= m_size && !m_eof) refill(); + } + +private: + std::ifstream m_in; + std::vector m_buf; + uint64_t m_pos = 0; + uint64_t m_size = 0; + bool m_eof = true; + + void refill() { + m_pos = 0; + m_in.read(reinterpret_cast(m_buf.data()), + static_cast(m_buf.size() * sizeof(T))); + const std::streamsize got = m_in.gcount(); + m_size = static_cast(got) / sizeof(T); + if (m_size == 0) m_eof = true; + } +}; + +} // namespace sshash diff --git a/include/builder/disk_backed_offsets_builder.hpp b/include/builder/disk_backed_offsets_builder.hpp index 6be3562..ff1a24c 100644 --- a/include/builder/disk_backed_offsets_builder.hpp +++ b/include/builder/disk_backed_offsets_builder.hpp @@ -10,6 +10,7 @@ #include #include +#include "include/builder/buffered_record_stream.hpp" #include "include/offsets.hpp" namespace sshash { @@ -130,61 +131,34 @@ struct disk_backed_offsets_builder { Forward-sequential reader over the offsets file. Each thread in compute_minimizer_tuples should construct one for its assigned index range; per-thread RAM footprint is the buffer size only. + Built on top of `buffered_record_stream`. */ struct reader { reader() = default; - reader(reader const&) = delete; - reader& operator=(reader const&) = delete; - reader(reader&&) = default; - reader& operator=(reader&&) = default; /* Open the file and seek so that the next `next()` call returns `*(values + start_index)`. 
*/ void open(std::string const& filename, uint64_t start_index, uint64_t buffer_records = default_reader_buffer_records) { - m_buf.assign(std::max(1, buffer_records), 0); - m_pos = 0; - m_size = 0; - m_in.open(filename, std::ifstream::binary); - if (!m_in.is_open()) { - throw std::runtime_error("cannot open offsets tmp file '" + filename + "'"); - } - m_in.seekg(static_cast(start_index * sizeof(uint64_t)), std::ios::beg); - refill(); + m_stream.open(filename, buffer_records, + static_cast(start_index * sizeof(uint64_t))); } - void close() { - if (m_in.is_open()) m_in.close(); - m_buf.clear(); - m_buf.shrink_to_fit(); - m_pos = 0; - m_size = 0; - } + void close() { m_stream.close(); } /* Return the next offset and advance. Caller must ensure they don't read past the end of the file. */ uint64_t next() { - if (m_pos >= m_size) refill(); - assert(m_pos < m_size); - return m_buf[m_pos++]; - } - - private: - std::ifstream m_in; - std::vector m_buf; - uint64_t m_pos = 0; - uint64_t m_size = 0; - - void refill() { - m_pos = 0; - m_in.read(reinterpret_cast(m_buf.data()), - static_cast(m_buf.size() * sizeof(uint64_t))); - const std::streamsize got = m_in.gcount(); - m_size = static_cast(got) / sizeof(uint64_t); - if (m_size == 0) { + if (m_stream.empty()) { throw std::runtime_error("disk_backed_offsets_builder: read past end of file"); } + const uint64_t v = m_stream.current(); + m_stream.advance(); + return v; } + + private: + buffered_record_stream m_stream; }; /* Construct a reader positioned at `start_index`. Requires freeze(). */ @@ -202,9 +176,10 @@ struct disk_backed_offsets_builder { /* A copyable forward iterator over the entire offsets file, suitable for the `Iterator`-template `encode` / `build` calls in - `bits::endpoints_sequence` and `bits::compact_vector`. Holds the - underlying ifstream via shared_ptr so the iterator can be copied - (those APIs may copy the iterator internally). + `bits::endpoints_sequence` and `bits::compact_vector`. 
Wraps a + shared_ptr> so the iterator is + copyable; copies share the underlying stream state, which is what + those APIs expect. */ struct full_iterator { using iterator_category = std::forward_iterator_tag; @@ -217,43 +192,22 @@ struct disk_backed_offsets_builder { void open(std::string const& filename, uint64_t buffer_records = default_reader_buffer_records) { - m_state = std::make_shared(); - m_state->buf.assign(std::max(1, buffer_records), 0); - m_state->in.open(filename, std::ifstream::binary); - if (!m_state->in.is_open()) { - throw std::runtime_error("cannot open offsets tmp file '" + filename + "'"); - } - m_state->refill(); + m_stream = std::make_shared>(); + m_stream->open(filename, buffer_records); } uint64_t operator*() const { - assert(m_state && m_state->pos < m_state->size); - return m_state->buf[m_state->pos]; + assert(m_stream); + return m_stream->current(); } full_iterator& operator++() { - assert(m_state); - ++m_state->pos; - if (m_state->pos >= m_state->size && !m_state->eof) m_state->refill(); + assert(m_stream); + m_stream->advance(); return *this; } private: - struct state { - std::ifstream in; - std::vector buf; - uint64_t pos = 0; - uint64_t size = 0; - bool eof = false; - void refill() { - pos = 0; - in.read(reinterpret_cast(buf.data()), - static_cast(buf.size() * sizeof(uint64_t))); - const std::streamsize got = in.gcount(); - size = static_cast(got) / sizeof(uint64_t); - if (size == 0) eof = true; - } - }; - std::shared_ptr m_state; + std::shared_ptr> m_stream; }; /* diff --git a/include/builder/file_merging_iterator.hpp b/include/builder/file_merging_iterator.hpp index ff191ee..37cf24c 100644 --- a/include/builder/file_merging_iterator.hpp +++ b/include/builder/file_merging_iterator.hpp @@ -7,6 +7,7 @@ #include #include +#include "buffered_record_stream.hpp" #include "util.hpp" namespace sshash { @@ -79,70 +80,9 @@ struct file_merging_iterator // } private: - /* - A buffered, forward-only reader over a single run file. 
Reads in - chunks of `m_buf.size()` records via std::ifstream and presents a - T-by-reference current-value interface. - */ - struct buffered_stream { - buffered_stream() = default; - buffered_stream(buffered_stream const&) = delete; - buffered_stream& operator=(buffered_stream const&) = delete; - buffered_stream(buffered_stream&&) = default; - buffered_stream& operator=(buffered_stream&&) = default; - - void open(std::string const& filename, uint64_t buffer_records) { - m_buf.resize(std::max(1, buffer_records)); - m_in.open(filename, std::ifstream::binary); - if (!m_in.is_open()) { - throw std::runtime_error("cannot open run file '" + filename + "'"); - } - m_pos = 0; - m_size = 0; - m_eof = false; - refill(); - } - - void close() { - if (m_in.is_open()) m_in.close(); - m_buf.clear(); - m_buf.shrink_to_fit(); - m_pos = 0; - m_size = 0; - m_eof = true; - } - - bool empty() const { return m_pos >= m_size; } - - T const& current() const { - assert(!empty()); - return m_buf[m_pos]; - } - - void advance() { - assert(!empty()); - ++m_pos; - if (m_pos >= m_size && !m_eof) refill(); - } - - private: - std::ifstream m_in; - std::vector m_buf; - uint64_t m_pos = 0; - uint64_t m_size = 0; - bool m_eof = true; - - void refill() { - m_pos = 0; - m_in.read(reinterpret_cast(m_buf.data()), - static_cast(m_buf.size() * sizeof(T))); - const std::streamsize got = m_in.gcount(); - m_size = static_cast(got) / sizeof(T); - if (m_size == 0) m_eof = true; - } - }; - - std::vector m_streams; + /* Each input run is read via a small buffered ifstream. 
*/ + using stream_t = buffered_record_stream; + std::vector m_streams; std::vector m_tree; uint64_t m_begin = 0, m_size = 0; diff --git a/include/builder/util.hpp b/include/builder/util.hpp index bd57038..72ebd21 100644 --- a/include/builder/util.hpp +++ b/include/builder/util.hpp @@ -5,6 +5,7 @@ #include #include +#include "buffered_record_stream.hpp" #include "file_merging_iterator.hpp" #include "parallel_sort.hpp" @@ -159,12 +160,14 @@ struct minimizers_tuples_iterator { Streaming forward iterator over a sorted minimizers tmp file that yields each distinct `minimizer` value exactly once (i.e., one value per bucket). Equivalent to `minimizers_tuples_iterator` over an mmap'd - buffer, but reads from std::ifstream so RAM usage is constant. + buffer, but built on top of `buffered_record_stream` + so RAM usage is constant. Copyable: pthash's `build_in_external_memory` takes the iterator by - value, so the underlying ifstream is held via shared_ptr. Copies share - the stream state; pthash's local copy advances the shared stream, and - the original at the call site is unused after the build returns. + value, so the underlying buffered stream is held via shared_ptr. + Copies share the stream state; pthash's local copy advances the + shared stream, and the original at the call site is unused after the + build returns. */ struct streaming_minimizers_iterator { using iterator_category = std::forward_iterator_tag; @@ -176,25 +179,14 @@ struct streaming_minimizers_iterator { streaming_minimizers_iterator() = default; void open(std::string const& filename) { - m_in = std::make_shared(filename, std::ifstream::binary); - if (!m_in->is_open()) { - throw std::runtime_error("cannot open minimizers tmp file '" + filename + "'"); - } - m_eof = false; - m_current = uint64_t(-1); - // Bootstrap: read the first tuple. 
- minimizer_tuple t; - m_in->read(reinterpret_cast(&t), sizeof(minimizer_tuple)); - if (m_in->gcount() != static_cast(sizeof(minimizer_tuple))) { - m_eof = true; - return; - } - m_current = t.minimizer; + m_stream = std::make_shared>(); + m_stream->open(filename); + m_current = m_stream->empty() ? uint64_t(-1) : m_stream->current().minimizer; } void close() { - if (m_in && m_in->is_open()) m_in->close(); - m_in.reset(); + if (m_stream) m_stream->close(); + m_stream.reset(); } uint64_t operator*() const { return m_current; } @@ -204,21 +196,16 @@ struct streaming_minimizers_iterator { } private: - std::shared_ptr m_in; + std::shared_ptr> m_stream; uint64_t m_current = uint64_t(-1); - bool m_eof = true; void advance_to_next_minimizer() { const uint64_t prev = m_current; - minimizer_tuple t; - while (true) { - m_in->read(reinterpret_cast(&t), sizeof(minimizer_tuple)); - if (m_in->gcount() != static_cast(sizeof(minimizer_tuple))) { - m_eof = true; - return; // m_current holds last value; pthash has consumed `num_minimizers` keys - } - if (t.minimizer != prev) { - m_current = t.minimizer; + while (!m_stream->empty()) { + m_stream->advance(); + if (m_stream->empty()) return; // m_current holds last value + if (m_stream->current().minimizer != prev) { + m_current = m_stream->current().minimizer; return; } } @@ -236,43 +223,27 @@ struct streaming_minimizers_iterator { inputs this peaks at max_bucket_size * sizeof(minimizer_tuple). */ struct streaming_minimizer_bucket_reader { - void open(std::string const& filename) { - m_in.open(filename, std::ifstream::binary); - if (!m_in.is_open()) { - throw std::runtime_error("cannot open minimizers tmp file '" + filename + "'"); - } - // Read first record into the lookahead slot, if any. 
- m_in.read(reinterpret_cast(&m_lookahead), sizeof(minimizer_tuple)); - m_eof = (m_in.gcount() != static_cast(sizeof(minimizer_tuple))); - } + void open(std::string const& filename) { m_stream.open(filename); } - void close() { - if (m_in.is_open()) m_in.close(); - } + void close() { m_stream.close(); } - bool has_next_bucket() const { return !m_eof; } + bool has_next_bucket() const { return !m_stream.empty(); } /* Read the next bucket into `bucket_out` (cleared first). All tuples in a bucket share the same minimizer. Returns the bucket's minimizer. */ uint64_t next_bucket(std::vector& bucket_out) { bucket_out.clear(); - assert(!m_eof); - const uint64_t mm = m_lookahead.minimizer; + assert(has_next_bucket()); + const uint64_t mm = m_stream.current().minimizer; do { - bucket_out.push_back(m_lookahead); - m_in.read(reinterpret_cast(&m_lookahead), sizeof(minimizer_tuple)); - if (m_in.gcount() != static_cast(sizeof(minimizer_tuple))) { - m_eof = true; - break; - } - } while (m_lookahead.minimizer == mm); + bucket_out.push_back(m_stream.current()); + m_stream.advance(); + } while (!m_stream.empty() && m_stream.current().minimizer == mm); return mm; } private: - std::ifstream m_in; - minimizer_tuple m_lookahead; - bool m_eof = true; + buffered_record_stream m_stream; }; struct minimizers_tuples { diff --git a/src/builder/build_sparse_and_skew_index.cpp b/src/builder/build_sparse_and_skew_index.cpp index a861093..5bf5636 100644 --- a/src/builder/build_sparse_and_skew_index.cpp +++ b/src/builder/build_sparse_and_skew_index.cpp @@ -56,17 +56,30 @@ struct position_tuple { #pragma pack(pop) /* - Forward iterator over a per-skew-partition tmp file produced by step - 7.2 phase (B). Each record is `(kmer.bits, uint32_t pos_in_bucket)`. - This iterator yields successive Kmer values, exposing the minimal - interface (`*it`, `++it`) that pthash's external-memory partitioned PHF - builder consumes. 
+ Per-skew-partition tmp file record (written by step 7.2 phase (B), + consumed by phase (C)): a kmer's bit pattern + the pos_in_bucket + we'll later pack into the partition's positions compact_vector. +*/ +#pragma pack(push, 4) +template +struct skew_kmer_record_t { + using kmer_bits_t = decltype(Kmer{}.bits); + kmer_bits_t kmer_bits; + uint32_t pib; +}; +#pragma pack(pop) + +/* + Forward iterator over a per-skew-partition tmp file produced by phase + (B). Yields successive Kmer values via the minimal interface (`*it`, + `++it`) that pthash's external-memory partitioned PHF builder + consumes. pthash takes the iterator by value, so it must be copyable. The - underlying `ifstream` is held via `shared_ptr` and shared between - copies; pthash's copy advances the shared stream state, which is fine - because the original at the call site is no longer used after the - build call returns. + underlying buffered_record_stream is held via shared_ptr and shared + between copies; pthash's copy advances the shared stream state, which + is fine because the original at the call site is unused after the + build returns. 
*/ template struct skew_partition_kmer_iterator { @@ -79,36 +92,28 @@ struct skew_partition_kmer_iterator { skew_partition_kmer_iterator() = default; void open(std::string const& filename) { - m_in = std::make_shared(filename, std::ifstream::binary); - if (!m_in->is_open()) { - throw std::runtime_error("cannot open skew-partition tmp file '" + filename + "'"); - } - advance(); + m_stream = std::make_shared>>(); + m_stream->open(filename); + if (!m_stream->empty()) m_current.bits = m_stream->current().kmer_bits; } void close() { - if (m_in && m_in->is_open()) m_in->close(); - m_in.reset(); + if (m_stream) m_stream->close(); + m_stream.reset(); } Kmer const& operator*() const { return m_current; } skew_partition_kmer_iterator& operator++() { - advance(); + if (!m_stream->empty()) { + m_stream->advance(); + if (!m_stream->empty()) m_current.bits = m_stream->current().kmer_bits; + } return *this; } private: - std::shared_ptr m_in; + std::shared_ptr>> m_stream; Kmer m_current; - - void advance() { - decltype(Kmer{}.bits) bits; - m_in->read(reinterpret_cast(&bits), sizeof(bits)); - if (m_in->gcount() != static_cast(sizeof(bits))) return; - uint32_t pib; - m_in->read(reinterpret_cast(&pib), sizeof(pib)); // skip pos_in_bucket - m_current.bits = bits; - } }; template @@ -551,9 +556,8 @@ void dictionary_builder::build_sparse_and_skew_index( kmer = std::min(kmer, kmer_rc); } auto& w = partition_writers[req.partition_id]; - w.write(reinterpret_cast(&kmer.bits), sizeof(kmer.bits)); - w.write(reinterpret_cast(&req.pos_in_bucket), - sizeof(req.pos_in_bucket)); + skew_kmer_record_t rec{kmer.bits, req.pos_in_bucket}; + w.write(reinterpret_cast(&rec), sizeof(rec)); kmer_it.next(); } merger.next(); @@ -703,20 +707,19 @@ void dictionary_builder::build_sparse_and_skew_index( pos_buffer.clear(); }; - std::ifstream in(kmer_fn, std::ifstream::binary); - if (!in.is_open()) { - throw std::runtime_error("cannot open skew-partition tmp file"); - } + buffered_record_stream> rec_stream; + 
rec_stream.open(kmer_fn); for (uint64_t i = 0; i != n; ++i) { + assert(!rec_stream.empty()); + auto const& rec = rec_stream.current(); Kmer kmer; - in.read(reinterpret_cast(&kmer.bits), sizeof(kmer.bits)); - uint32_t pib; - in.read(reinterpret_cast(&pib), sizeof(pib)); + kmer.bits = rec.kmer_bits; const uint64_t pos = F(kmer); if (pos_buffer.size() == pos_buffer_capacity) flush_pos_buffer(); - pos_buffer.emplace_back(pos, pib); + pos_buffer.emplace_back(pos, rec.pib); + rec_stream.advance(); } - in.close(); + rec_stream.close(); std::remove(kmer_fn.c_str()); flush_pos_buffer(); } From a35c36458ee040f2a2c0caecd084e255cc5565a1 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 6 May 2026 14:41:01 +0000 Subject: [PATCH 22/32] build: add --no-streaming-save flag for in-RAM save path Currently the build CLI picks between two paths: -o without --check -> dictionary::build_streaming_save (spilled components are stitched into the output via the streaming saver; `dict` is not query-ready afterward) -o with --check (or no -o) -> dictionary::build (spilled components are materialized back into `dict`, then optionally essentials::save) For users with plenty of RAM who don't want the streaming-save tmp-file concatenation (and don't need --check), expose the in-RAM save path explicitly via --no-streaming-save. When set, the build does build() + essentials::save: peak RSS at save time briefly equals the in-RAM index size, but the save is a single pass over `dict` rather than a stitched concatenation. Useful when the user already pays the memory cost (e.g., to query the dict immediately afterward in another tool, or just prefers the simpler save path). Both flows produce byte-identical output files; the flag only affects the save path. 
https://claude.ai/code/session_01BShS2GDASvEsCAbgJyQVBK --- tools/build.cpp | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/tools/build.cpp b/tools/build.cpp index fbc7d1d..1a530c0 100644 --- a/tools/build.cpp +++ b/tools/build.cpp @@ -46,6 +46,13 @@ int build(int argc, char** argv) { true); parser.add("check", "Check correctness after construction.", "--check", false, true); parser.add("verbose", "Verbose output during construction.", "--verbose", false, true); + parser.add("no_streaming_save", + "Force the in-RAM save path even with -o: build, materialize the dictionary in RAM, " + "then write it via essentials::save. Peak RSS at save time briefly equals the " + "in-RAM index size; useful when the user has plenty of memory and wants a single " + "save call rather than the streaming-save tmp-file concatenation. Implied by " + "--check (which always materializes for query).", + "--no-streaming-save", false, true); if (!parser.parse()) return 0; @@ -74,19 +81,28 @@ int build(int argc, char** argv) { // build_config.print(); bool check = parser.get("check"); + bool no_streaming_save = parser.get("no_streaming_save"); bool has_output = parser.parsed("output_filename"); dictionary_type dict; - if (has_output && !check) { + if (has_output && !check && !no_streaming_save) { /* Streaming-save path: keeps peak RAM bounded by the build phase - (the strings bit-vector is never fully in RAM). After this returns - `dict` is not query-ready; reload from disk to query. */ + (the strings bit-vector and the spilled compact_vectors / MPHFs + are never fully in RAM). After this returns `dict` is not + query-ready; reload from disk to query. */ auto output_filename = parser.get("output_filename"); essentials::logger("building data structure (streaming save)..."); dict.build_streaming_save(input_filename, build_config, output_filename); essentials::logger("DONE"); } else { + /* In-RAM save path. 
The build still spills internally for + bounded-RAM construction, but at the end every spilled + component is materialized back into `dict` so it's + query-ready. Used whenever --check is requested (queries need + `dict` populated) or when the user explicitly opts in via + --no-streaming-save. Peak RSS briefly hits the full index + size at save time. */ essentials::logger("building data structure..."); dict.build(input_filename, build_config); From d7dc21dcc4655dcdf3b81cd50dbb967140ba6b3e Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 6 May 2026 16:07:51 +0000 Subject: [PATCH 23/32] Revert "build: add --no-streaming-save flag for in-RAM save path" This reverts commit a35c36458ee040f2a2c0caecd084e255cc5565a1. --- tools/build.cpp | 22 +++------------------- 1 file changed, 3 insertions(+), 19 deletions(-) diff --git a/tools/build.cpp b/tools/build.cpp index 1a530c0..fbc7d1d 100644 --- a/tools/build.cpp +++ b/tools/build.cpp @@ -46,13 +46,6 @@ int build(int argc, char** argv) { true); parser.add("check", "Check correctness after construction.", "--check", false, true); parser.add("verbose", "Verbose output during construction.", "--verbose", false, true); - parser.add("no_streaming_save", - "Force the in-RAM save path even with -o: build, materialize the dictionary in RAM, " - "then write it via essentials::save. Peak RSS at save time briefly equals the " - "in-RAM index size; useful when the user has plenty of memory and wants a single " - "save call rather than the streaming-save tmp-file concatenation. 
Implied by " - "--check (which always materializes for query).", - "--no-streaming-save", false, true); if (!parser.parse()) return 0; @@ -81,28 +74,19 @@ int build(int argc, char** argv) { // build_config.print(); bool check = parser.get("check"); - bool no_streaming_save = parser.get("no_streaming_save"); bool has_output = parser.parsed("output_filename"); dictionary_type dict; - if (has_output && !check && !no_streaming_save) { + if (has_output && !check) { /* Streaming-save path: keeps peak RAM bounded by the build phase - (the strings bit-vector and the spilled compact_vectors / MPHFs - are never fully in RAM). After this returns `dict` is not - query-ready; reload from disk to query. */ + (the strings bit-vector is never fully in RAM). After this returns + `dict` is not query-ready; reload from disk to query. */ auto output_filename = parser.get("output_filename"); essentials::logger("building data structure (streaming save)..."); dict.build_streaming_save(input_filename, build_config, output_filename); essentials::logger("DONE"); } else { - /* In-RAM save path. The build still spills internally for - bounded-RAM construction, but at the end every spilled - component is materialized back into `dict` so it's - query-ready. Used whenever --check is requested (queries need - `dict` populated) or when the user explicitly opts in via - --no-streaming-save. Peak RSS briefly hits the full index - size at save time. 
*/ essentials::logger("building data structure..."); dict.build(input_filename, build_config); From d8328511d3a59ecae62276986fd945aa8a9f7769 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 6 May 2026 16:38:37 +0000 Subject: [PATCH 24/32] finalize_stats: print total bits/kmer also in streaming-save flow MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously the streaming-save flow skipped print_space_breakdown (d.m_spss.strings is an empty placeholder there, so calling breakdown would just print zeros), which meant the user got no size summary at the end. Master always prints "total ... bits/kmer" though. Stat the saved file's size when `d` isn't materialized and print the total directly. The in-RAM flow keeps the existing per-component breakdown unchanged. Cost is one fstat at the very end of the build. Also fixes index_size_in_bytes in the JSON-line build_stats output for the streaming-save flow (it used to report just the in-RAM-resident bytes ≈ a few hundred MB instead of the actual on-disk index size). https://claude.ai/code/session_01BShS2GDASvEsCAbgJyQVBK --- include/builder/dictionary_builder.hpp | 32 +++++++++++++++++++++----- 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/include/builder/dictionary_builder.hpp b/include/builder/dictionary_builder.hpp index 4cf8d0c..4475bb6 100644 --- a/include/builder/dictionary_builder.hpp +++ b/include/builder/dictionary_builder.hpp @@ -146,7 +146,7 @@ struct dictionary_builder // strings_builder.remove_file(); spilled.clear_files(); }); - finalize_stats(d); + finalize_stats(d, output_filename); } build_configuration build_config; @@ -272,16 +272,36 @@ struct dictionary_builder // }); } - void finalize_stats(dictionary& d) { + void finalize_stats(dictionary& d, std::string const& saved_path = "") { + /* For the materialize-to-RAM flow `d` is fully populated and we + can call `d.print_space_breakdown()` / `d.num_bits()` directly. 
+ For the streaming-save flow `d`'s spilled components are empty + placeholders, so we read the on-disk index file's size for the + total — this is just a stat, no recomputation. */ + const bool d_is_populated = d.m_spss.strings.num_bits() > 0; + uint64_t num_bytes = 0; + if (d_is_populated) { + num_bytes = (d.num_bits() + 7) / 8; + } else if (!saved_path.empty()) { + std::ifstream f(saved_path, std::ios::binary | std::ios::ate); + if (f.is_open()) num_bytes = static_cast(f.tellg()); + } + if (build_config.verbose) { print_time(total_time_musec, "total time"); - /* `print_space_breakdown` reads d.m_spss.strings; only safe in - the materialize-to-RAM flow. */ - if (d.m_spss.strings.num_bits() > 0) d.print_space_breakdown(); + if (d_is_populated) { + d.print_space_breakdown(); + } else if (num_bytes > 0) { + std::cout << "total index size: " << num_bytes << " [B] -- " + << essentials::convert(num_bytes, essentials::MB) << " [MB]\n"; + std::cout << " total: " + << (num_kmers > 0 ? (8.0 * num_bytes) / num_kmers : 0.0) + << " [bits/kmer]" << std::endl; + } } build_stats.add("total_build_time_in_microsec", total_time_musec); - build_stats.add("index_size_in_bytes", (d.num_bits() + 7) / 8); + build_stats.add("index_size_in_bytes", num_bytes); build_stats.add("num_kmers", d.num_kmers()); if (build_config.verbose) build_stats.print(); From 030f1d0774added90c5ce5fe96087cbc59c3851a Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 6 May 2026 16:49:55 +0000 Subject: [PATCH 25/32] build_stats: format step timings as seconds with [sec] unit All build-step durations in the JSON-line build_stats output were raw microseconds, which was hard to read. Convert them to "X.XXX [sec]" via a small `musec_as_seconds_str` helper. Steps 7.1 and 7.2 are added directly inside build_sparse_and_skew_index.cpp, so apply the same helper there too. Also switch finalize_stats to std::filesystem::file_size for the saved index size, instead of an fstream + tellg. 
--- include/builder/dictionary_builder.hpp | 23 ++++++++++++++++----- src/builder/build_sparse_and_skew_index.cpp | 6 ++++-- 2 files changed, 22 insertions(+), 7 deletions(-) diff --git a/include/builder/dictionary_builder.hpp b/include/builder/dictionary_builder.hpp index 4475bb6..49e8589 100644 --- a/include/builder/dictionary_builder.hpp +++ b/include/builder/dictionary_builder.hpp @@ -1,5 +1,7 @@ #pragma once +#include +#include #include #include "essentials.hpp" @@ -277,14 +279,16 @@ struct dictionary_builder // can call `d.print_space_breakdown()` / `d.num_bits()` directly. For the streaming-save flow `d`'s spilled components are empty placeholders, so we read the on-disk index file's size for the - total — this is just a stat, no recomputation. */ + total via `std::filesystem::file_size` — direct OS stat, no + recomputation. */ const bool d_is_populated = d.m_spss.strings.num_bits() > 0; uint64_t num_bytes = 0; if (d_is_populated) { num_bytes = (d.num_bits() + 7) / 8; } else if (!saved_path.empty()) { - std::ifstream f(saved_path, std::ios::binary | std::ios::ate); - if (f.is_open()) num_bytes = static_cast(f.tellg()); + std::error_code ec; + const auto sz = std::filesystem::file_size(saved_path, ec); + if (!ec) num_bytes = static_cast(sz); } if (build_config.verbose) { @@ -300,7 +304,7 @@ struct dictionary_builder // } } - build_stats.add("total_build_time_in_microsec", total_time_musec); + build_stats.add("total_build_time", musec_as_seconds_str(total_time_musec).c_str()); build_stats.add("index_size_in_bytes", num_bytes); build_stats.add("num_kmers", d.num_kmers()); @@ -312,6 +316,15 @@ struct dictionary_builder // << (time_in_musec * 1000) / num_kmers << " [ns/kmer])" << std::endl; } + /* Format a microsecond count as e.g. "7.292 [sec]" for the JSON-line + build_stats output. Three decimals = millisecond precision, which is + both compact and plenty precise for build-step durations. 
*/ + static std::string musec_as_seconds_str(uint64_t musec) { + char buf[64]; + std::snprintf(buf, sizeof(buf), "%.3f [sec]", static_cast(musec) / 1.0e6); + return std::string(buf); + } + template void do_step(std::string const& step, Callback const& f) { timer.start(); @@ -320,7 +333,7 @@ struct dictionary_builder // uint64_t step_elapsed_time_musec = timer.elapsed(); total_time_musec += step_elapsed_time_musec; if (build_config.verbose) print_time(step_elapsed_time_musec, step); - build_stats.add(step, step_elapsed_time_musec); + build_stats.add(step, musec_as_seconds_str(step_elapsed_time_musec).c_str()); timer.reset(); } diff --git a/src/builder/build_sparse_and_skew_index.cpp b/src/builder/build_sparse_and_skew_index.cpp index 5bf5636..e7c1a17 100644 --- a/src/builder/build_sparse_and_skew_index.cpp +++ b/src/builder/build_sparse_and_skew_index.cpp @@ -474,7 +474,8 @@ void dictionary_builder::build_sparse_and_skew_index( } timer.stop(); - build_stats.add("step 7.1 (build sparse index)", uint64_t(timer.elapsed())); + build_stats.add("step 7.1 (build sparse index)", + musec_as_seconds_str(uint64_t(timer.elapsed())).c_str()); if (build_config.verbose) { print_time(uint64_t(timer.elapsed()), "step 7.1 (build sparse index)"); } @@ -791,7 +792,8 @@ void dictionary_builder::build_sparse_and_skew_index( timer.stop(); - build_stats.add("step 7.2 (build skew index)", uint64_t(timer.elapsed())); + build_stats.add("step 7.2 (build skew index)", + musec_as_seconds_str(uint64_t(timer.elapsed())).c_str()); if (build_config.verbose) { print_time(uint64_t(timer.elapsed()), "step 7.2 (build skew index)"); From 659b9561010bc72145b6274bcc58cdba63276171 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 6 May 2026 16:57:57 +0000 Subject: [PATCH 26/32] remove dead bucket_type / minimizers_tuples_iterator These two structs in include/builder/util.hpp were the in-RAM iterator types that walked an mmap'd minimizer-tuples buffer. 
The build pipeline now reads minimizers via streaming_minimizers_iterator and streaming_minimizer_bucket_reader (both built on buffered_record_stream), and nothing else referenced bucket_type or minimizers_tuples_iterator. Also drop the now-stale comment references to minimizers_tuples_iterator in the surviving streaming iterators' docstrings. --- include/builder/dictionary_builder.hpp | 3 +- include/builder/util.hpp | 103 +------------------------ 2 files changed, 4 insertions(+), 102 deletions(-) diff --git a/include/builder/dictionary_builder.hpp b/include/builder/dictionary_builder.hpp index 49e8589..e8cf02c 100644 --- a/include/builder/dictionary_builder.hpp +++ b/include/builder/dictionary_builder.hpp @@ -345,8 +345,7 @@ struct dictionary_builder // void build_mphf(dictionary& d) { const uint64_t num_minimizers = minimizers.num_minimizers(); /* Stream minimizers from disk via std::ifstream (no mmap); the - iterator yields each distinct minimizer once, matching what - `minimizers_tuples_iterator` did over the mmap'd file. */ + iterator yields each distinct minimizer once. 
*/ streaming_minimizers_iterator iterator; iterator.open(minimizers.get_minimizers_filename()); d.m_ssi.codewords.build(iterator, num_minimizers, build_config); diff --git a/include/builder/util.hpp b/include/builder/util.hpp index 72ebd21..94761b9 100644 --- a/include/builder/util.hpp +++ b/include/builder/util.hpp @@ -61,106 +61,10 @@ inline std::ostream& operator<<(std::ostream& os, minimizer_tuple const& mt) { return os; } -struct bucket_type { - bucket_type(minimizer_tuple const* begin, minimizer_tuple const* end) - : m_begin(begin), m_end(end) {} - - struct iterator { - iterator(minimizer_tuple const* begin) : m_begin(begin) {} - - inline minimizer_tuple operator*() const { return *m_begin; } - inline void operator++() { ++m_begin; } - bool operator==(iterator const& other) const { return m_begin == other.m_begin; } - bool operator!=(iterator const& other) const { return !(*this == other); } - - private: - minimizer_tuple const* m_begin; - }; - - iterator begin() const { return iterator(m_begin); } - iterator end() const { return iterator(m_end); } - - /* - When a canonical index is built (option `--canonical`), - a minimizer offset can correspond to more than one super-kmer. - A super-kmer is uniquely identified by the couple - (minimizer offset, position of minimizer in the first kmer of the super-kmer). - These two components, together, give the - starting position of a super-kmer in the sequence. - - So the method size() returns the number of minimizer - positions which is <= the number of superkmers. 
- */ - - uint64_t num_super_kmers() const { return std::distance(m_begin, m_end); } - - uint64_t size() const { - uint64_t num_minimizer_positions = 0; - uint64_t prev_pos_in_seq = constants::invalid_uint64; - auto const* begin = m_begin; - while (begin != m_end) { - uint64_t pos_in_seq = (*begin).pos_in_seq; - if (pos_in_seq != prev_pos_in_seq) { - ++num_minimizer_positions; - prev_pos_in_seq = pos_in_seq; - } - ++begin; - } - assert(num_minimizer_positions <= num_super_kmers()); - return num_minimizer_positions; - } - - minimizer_tuple const* begin_ptr() const { return m_begin; } - minimizer_tuple const* end_ptr() const { return m_end; } - -private: - minimizer_tuple const* m_begin; - minimizer_tuple const* m_end; -}; - -/* - Iterate over the "bucket" of a minimizer, i.e., - the sorted list of minimizer tuples - (minimizer, pos_in_seq, pos_in_kmer, num_kmers_in_superkmer). -*/ -struct minimizers_tuples_iterator { - typedef minimizer_tuple value_type; - using iterator_category = std::forward_iterator_tag; - - minimizers_tuples_iterator(minimizer_tuple const* begin, minimizer_tuple const* end) - : m_bucket_begin(begin), m_bucket_end(begin), m_end(end) { - m_bucket_end = next_begin(); - } - - inline uint64_t minimizer() const { return (*m_bucket_begin).minimizer; } - inline uint64_t operator*() const { return minimizer(); } - inline void next() { - m_bucket_begin = m_bucket_end; - m_bucket_end = next_begin(); - } - inline void operator++() { next(); } - bool has_next() const { return m_bucket_begin != m_end; } - bucket_type bucket() const { return bucket_type(m_bucket_begin, m_bucket_end); } - -private: - minimizer_tuple const* m_bucket_begin; - minimizer_tuple const* m_bucket_end; - minimizer_tuple const* m_end; - - minimizer_tuple const* next_begin() { - if (m_bucket_begin == m_end) return m_end; - minimizer_tuple const* begin = m_bucket_begin; - uint64_t prev_minimizer = begin->minimizer; - while (++begin != m_end and begin->minimizer == prev_minimizer) {} - 
return begin; - } -}; - /* Streaming forward iterator over a sorted minimizers tmp file that yields each distinct `minimizer` value exactly once (i.e., one value - per bucket). Equivalent to `minimizers_tuples_iterator` over an mmap'd - buffer, but built on top of `buffered_record_stream` + per bucket), built on top of `buffered_record_stream` so RAM usage is constant. Copyable: pthash's `build_in_external_memory` takes the iterator by @@ -215,9 +119,8 @@ struct streaming_minimizers_iterator { /* Streaming reader over a minimizers tmp file. Reads minimizer_tuple records via std::ifstream (no mmap), and groups consecutive tuples by - minimizer into "buckets" — exactly as `minimizers_tuples_iterator` does - over an mmap'd buffer, but with bounded RAM (~ one bucket at a time - plus one record of lookahead). + minimizer into "buckets" with bounded RAM (~ one bucket at a time plus + one record of lookahead). The caller passes a vector to receive the bucket's tuples; for typical inputs this peaks at max_bucket_size * sizeof(minimizer_tuple). From 1d765419af20bca71547bb8f0a77d12133555a12 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Wed, 6 May 2026 19:02:16 +0200 Subject: [PATCH 27/32] minor --- include/constants.hpp | 4 ---- include/offsets.hpp | 6 ++++-- include/util.hpp | 27 ++++++++++++--------------- src/builder/build.cpp | 8 +++----- 4 files changed, 19 insertions(+), 26 deletions(-) diff --git a/include/constants.hpp b/include/constants.hpp index a020a6a..ec779b6 100644 --- a/include/constants.hpp +++ b/include/constants.hpp @@ -4,10 +4,6 @@ namespace sshash::constants { constexpr uint64_t invalid_uint64 = uint64_t(-1); constexpr uint64_t default_ram_limit_in_GiB = 8; -/* Floor on --ram-limit. Below this the build's streaming buffers + pthash's - internal working memory can't usefully be made to fit; rather than degrade - further at very tight budgets, we clamp `-g` to at least this value - (modest by today's desktop standards). 
*/ constexpr uint64_t min_ram_limit_in_GiB = 4; constexpr uint64_t seed = 1; diff --git a/include/offsets.hpp b/include/offsets.hpp index b592e7b..6307471 100644 --- a/include/offsets.hpp +++ b/include/offsets.hpp @@ -5,7 +5,8 @@ namespace sshash { -template struct disk_backed_offsets_builder; +template +struct disk_backed_offsets_builder; struct num_bits { num_bits() : per_absolute_offset(0), per_relative_offset(0), per_string_id(0) {} @@ -106,7 +107,8 @@ struct offsets // /* Allow disk_backed_offsets_builder to populate m_seq directly via a streaming forward iterator (mirroring what `Seq`'s nested builder does, but with on-disk values). */ - template friend struct disk_backed_offsets_builder; + template + friend struct disk_backed_offsets_builder; protected: Seq m_seq; diff --git a/include/util.hpp b/include/util.hpp index e45574f..c29e748 100644 --- a/include/util.hpp +++ b/include/util.hpp @@ -221,20 +221,18 @@ static inline uint64_t get_seed_for_hash_function(build_configuration const& bui to capping threads — in that case we emit a warning naming the MPHF so the user knows the requested -t couldn't be honored. 
*/ -static inline void configure_mphf_threads_and_partition( - pthash::build_configuration& mphf, // - uint64_t requested_num_threads, // - uint64_t ram_limit_in_GiB, // - bool verbose, // - char const* mphf_name) // +static inline void configure_mphf_threads_and_partition(pthash::build_configuration& mphf, // + uint64_t requested_num_threads, // + uint64_t ram_limit_in_GiB, // + bool verbose, // + char const* mphf_name) // { - constexpr uint64_t per_key_bytes = 32; // pairs_t entry + sort slack + constexpr uint64_t per_key_bytes = 32; // pairs_t entry + sort slack constexpr uint64_t min_avg_partition_size = uint64_t(100) * 1000; const uint64_t default_avg = constants::avg_partition_size; const uint64_t pthash_ram = (ram_limit_in_GiB * essentials::GiB) / 2; - const uint64_t per_thread = - pthash_ram / std::max(1, requested_num_threads); + const uint64_t per_thread = pthash_ram / std::max(1, requested_num_threads); const uint64_t avg_for_thread_budget = per_thread / per_key_bytes; if (avg_for_thread_budget >= default_avg) { @@ -249,14 +247,13 @@ static inline void configure_mphf_threads_and_partition( } else { /* Pathological: not enough RAM per thread even at the floor. Cap threads so the floor fits. 
*/ - const uint64_t max_threads = std::max( - 1, pthash_ram / (per_key_bytes * min_avg_partition_size)); + const uint64_t max_threads = + std::max(1, pthash_ram / (per_key_bytes * min_avg_partition_size)); if (verbose) { std::cout << " --> WARNING: not enough RAM per thread for " << mphf_name - << " (--ram-limit=" << ram_limit_in_GiB << " GiB, " - << requested_num_threads << " requested threads): capping to " - << max_threads << " threads at min partition size " - << min_avg_partition_size << std::endl; + << " (--ram-limit=" << ram_limit_in_GiB << " GiB, " << requested_num_threads + << " requested threads): capping to " << max_threads + << " threads at min partition size " << min_avg_partition_size << std::endl; } mphf.num_threads = max_threads; mphf.avg_partition_size = min_avg_partition_size; diff --git a/src/builder/build.cpp b/src/builder/build.cpp index 0f6210b..812a340 100644 --- a/src/builder/build.cpp +++ b/src/builder/build.cpp @@ -9,12 +9,14 @@ namespace sshash { namespace { inline void validate_and_normalize_build_config(build_configuration& bc, uint64_t max_k, - uint64_t max_m) { + uint64_t max_m) // +{ if (bc.k == 0) throw std::runtime_error("k must be > 0"); if (bc.k > max_k) { throw std::runtime_error("k must be less <= " + std::to_string(max_k) + " but got k = " + std::to_string(bc.k)); } + if (bc.m == 0) throw std::runtime_error("m must be > 0"); if (bc.m > max_m) { throw std::runtime_error("m must be less <= " + std::to_string(max_m) + @@ -22,10 +24,6 @@ inline void validate_and_normalize_build_config(build_configuration& bc, uint64_ } if (bc.m > bc.k) throw std::runtime_error("m must be <= k"); - /* Clamp --ram-limit to the floor. Below this, the streaming buffers - plus pthash's internal working memory can't usefully be made to - fit; rather than try to squeeze further we treat the floor as the - effective budget. 
*/ if (bc.ram_limit_in_GiB < constants::min_ram_limit_in_GiB) { if (bc.verbose) { std::cout << " --> NOTE: --ram-limit raised from " << bc.ram_limit_in_GiB From 033d59f85b43b6e5165b30d547c5a3af6e355c3d Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 6 May 2026 17:18:49 +0000 Subject: [PATCH 28/32] always build via streaming-save; mmap the saved file for --check The in-RAM build path now duplicated work: it materialized all spilled components back into RAM after step 7 just so --check could query the in-memory dict. Drop it. Make streaming-save the only build path: * dictionary::build now takes output_filename; the streaming variant is gone and there is no longer a query-ready in-memory build. * dictionary_builder::build_streaming_save -> build, and the materialize_spilled_into / materialize_compact_vector_from_file helpers are removed (~70 LOC). * finalize_stats no longer branches on whether `d` is populated. * tools/build.cpp always streams to a file. If the user passed -o, that path is used; otherwise a tmp file under tmp_dirname is written and removed on exit. --check loads the saved file via open_dictionary with mmap=true and runs the existing correctness checks against that. --- include/builder/dictionary_builder.hpp | 119 ++++--------------------- include/dictionary.hpp | 19 ++-- src/builder/build.cpp | 18 +--- tools/build.cpp | 55 ++++++------ 4 files changed, 59 insertions(+), 152 deletions(-) diff --git a/include/builder/dictionary_builder.hpp b/include/builder/dictionary_builder.hpp index e8cf02c..0439d7c 100644 --- a/include/builder/dictionary_builder.hpp +++ b/include/builder/dictionary_builder.hpp @@ -17,21 +17,9 @@ namespace sshash { /* - Helper: load a serialized bits::compact_vector back from a tmp file - into the given in-RAM compact_vector. Used by the materializing build - flow (after step 7) so that --check / queries can run. 
-*/ -inline void materialize_compact_vector_from_file(bits::compact_vector& cv, - std::string const& filename) { - essentials::loader loader(filename.c_str()); - loader.visit(cv); -} - -/* - Tmp file paths for the compact_vectors that step 7 spills to disk. - Populated by build_sparse_and_skew_index; consumed by step 8 (either - materialized back into RAM for `build()`, or injected into the output - by `build_streaming_save()`). + Tmp file paths for the compact_vectors and MPHFs that step 7 spills + to disk. Populated by build_sparse_and_skew_index and injected into + the output by step 8 (stream-save). */ struct spilled_components { std::string control_codewords_path; @@ -72,32 +60,14 @@ struct dictionary_builder // } /* - Build a query-ready dictionary in `d`. After this returns, all - spilled components and `d.m_spss.strings` are materialized in RAM - (peak briefly equals the index size). Use this when the caller - needs to query `d` post-build (e.g., `--check`). - */ - void build(dictionary& d, std::string const& filename) { - run_steps_1_through_7(d, filename); - do_step("step 8 (materialize spilled components to RAM)", [&]() { - materialize_spilled_into(d); - strings_builder.load_into(d.m_spss.strings); - strings_builder.remove_file(); - spilled.clear_files(); - }); - finalize_stats(d); - } - - /* - Build the dictionary and stream-save it to `output_filename` without - ever materializing the spilled components or `strings` in RAM. - After this returns, `d` is *not* query-ready. Use this when the - caller only needs the on-disk index file and wants to keep peak RAM - bounded by the build phase. + Build the dictionary and stream-save it to `output_filename` + without ever materializing the spilled components or `strings` + in RAM. After this returns, `d` is *not* query-ready; reload the + saved file via `essentials::load` / `essentials::mmap` to query. 
*/ - void build_streaming_save(dictionary& d, // - std::string const& filename, // - std::string const& output_filename) // + void build(dictionary& d, // + std::string const& filename, // + std::string const& output_filename) // { run_steps_1_through_7(d, filename); do_step("step 8 (stream-save dictionary to disk)", [&]() { @@ -166,49 +136,6 @@ struct dictionary_builder // uint64_t total_time_musec; private: - /* Load each spilled compact_vector tmp file back into the corresponding - in-RAM compact_vector inside `d`. Used by the materializing build - flow so queries can run against `d` (e.g., during --check). */ - void materialize_spilled_into(dictionary& d) { - if (!spilled.control_codewords_path.empty()) { - materialize_compact_vector_from_file(d.m_ssi.codewords.control_codewords, - spilled.control_codewords_path); - } - if (!spilled.mid_load_buckets_path.empty()) { - materialize_compact_vector_from_file(d.m_ssi.mid_load_buckets, - spilled.mid_load_buckets_path); - } - if (!spilled.heavy_load_buckets_path.empty()) { - materialize_compact_vector_from_file(d.m_ssi.ski.heavy_load_buckets, - spilled.heavy_load_buckets_path); - } - /* Reload the spilled MPHFs back into RAM so queries work. 
*/ - if (!spilled.codewords_mphf_path.empty()) { - essentials::loader loader(spilled.codewords_mphf_path.c_str()); - loader.visit(d.m_ssi.codewords.mphf); - } - const std::size_t num_part = - std::max(spilled.skew_positions_paths.size(), spilled.skew_mphfs_paths.size()); - if (num_part > 0) { - std::vector positions_vec(num_part); - std::vector> mphfs_vec(num_part); - for (std::size_t i = 0; i != spilled.skew_positions_paths.size(); ++i) { - if (!spilled.skew_positions_paths[i].empty()) { - materialize_compact_vector_from_file(positions_vec[i], - spilled.skew_positions_paths[i]); - } - } - for (std::size_t i = 0; i != spilled.skew_mphfs_paths.size(); ++i) { - if (!spilled.skew_mphfs_paths[i].empty()) { - essentials::loader loader(spilled.skew_mphfs_paths[i].c_str()); - loader.visit(mphfs_vec[i]); - } - } - d.m_ssi.ski.positions = std::move(positions_vec); - d.m_ssi.ski.mphfs = std::move(mphfs_vec); - } - } - void run_steps_1_through_7(dictionary& d, std::string const& filename) { d.m_k = build_config.k; d.m_m = build_config.m; @@ -274,28 +201,18 @@ struct dictionary_builder // }); } - void finalize_stats(dictionary& d, std::string const& saved_path = "") { - /* For the materialize-to-RAM flow `d` is fully populated and we - can call `d.print_space_breakdown()` / `d.num_bits()` directly. - For the streaming-save flow `d`'s spilled components are empty - placeholders, so we read the on-disk index file's size for the - total via `std::filesystem::file_size` — direct OS stat, no - recomputation. */ - const bool d_is_populated = d.m_spss.strings.num_bits() > 0; + void finalize_stats(dictionary& d, std::string const& saved_path) { + /* `d`'s spilled components are empty placeholders post stream-save, + so read the on-disk index file's size via std::filesystem::file_size + rather than recomputing from `d`. 
*/ uint64_t num_bytes = 0; - if (d_is_populated) { - num_bytes = (d.num_bits() + 7) / 8; - } else if (!saved_path.empty()) { - std::error_code ec; - const auto sz = std::filesystem::file_size(saved_path, ec); - if (!ec) num_bytes = static_cast(sz); - } + std::error_code ec; + const auto sz = std::filesystem::file_size(saved_path, ec); + if (!ec) num_bytes = static_cast(sz); if (build_config.verbose) { print_time(total_time_musec, "total time"); - if (d_is_populated) { - d.print_space_breakdown(); - } else if (num_bytes > 0) { + if (num_bytes > 0) { std::cout << "total index size: " << num_bytes << " [B] -- " << essentials::convert(num_bytes, essentials::MB) << " [MB]\n"; std::cout << " total: " diff --git a/include/dictionary.hpp b/include/dictionary.hpp index 7790efb..e553b6e 100644 --- a/include/dictionary.hpp +++ b/include/dictionary.hpp @@ -25,19 +25,16 @@ struct dictionary // , m_m(0) , m_canonical(false) {} - /* Build from input file. After this returns, `*this` is query-ready. */ - void build(std::string const& input_filename, build_configuration const& build_config); - /* - Build from input file and stream-save the resulting dictionary to - `output_filename`. The strings bit-vector is never materialized in - RAM during construction, so peak RAM is bounded by the build phase - only. After this returns, `*this` is *not* query-ready - (`m_spss.strings` is empty); reload via `essentials::load` to query. + Build from input file, streaming the resulting dictionary to + `output_filename` as it goes. The strings bit-vector and the + sparse/skew components are never fully materialized in RAM during + construction, so peak RAM is bounded by the build phase only. + After this returns, `*this` is *not* query-ready; load the saved + index back via `essentials::load` / `essentials::mmap` to query. 
*/ - void build_streaming_save(std::string const& input_filename, - build_configuration const& build_config, - std::string const& output_filename); + void build(std::string const& input_filename, build_configuration const& build_config, + std::string const& output_filename); essentials::version_number vnum() const { return m_vnum; } uint64_t num_kmers() const { return m_num_kmers; } diff --git a/src/builder/build.cpp b/src/builder/build.cpp index 812a340..e4ee59a 100644 --- a/src/builder/build.cpp +++ b/src/builder/build.cpp @@ -37,24 +37,14 @@ inline void validate_and_normalize_build_config(build_configuration& bc, uint64_ } // namespace template -void dictionary::build(std::string const& filename, - build_configuration const& build_config) // +void dictionary::build(std::string const& input_filename, + build_configuration const& build_config, + std::string const& output_filename) // { build_configuration bc = build_config; validate_and_normalize_build_config(bc, Kmer::max_k, Kmer::max_m); dictionary_builder builder(bc); - builder.build(*this, filename); -} - -template -void dictionary::build_streaming_save(std::string const& input_filename, - build_configuration const& build_config, - std::string const& output_filename) // -{ - build_configuration bc = build_config; - validate_and_normalize_build_config(bc, Kmer::max_k, Kmer::max_m); - dictionary_builder builder(bc); - builder.build_streaming_save(*this, input_filename, output_filename); + builder.build(*this, input_filename, output_filename); } } // namespace sshash diff --git a/tools/build.cpp b/tools/build.cpp index fbc7d1d..e6e3f67 100644 --- a/tools/build.cpp +++ b/tools/build.cpp @@ -76,36 +76,39 @@ int build(int argc, char** argv) { bool check = parser.get("check"); bool has_output = parser.parsed("output_filename"); - dictionary_type dict; - - if (has_output && !check) { - /* Streaming-save path: keeps peak RAM bounded by the build phase - (the strings bit-vector is never fully in RAM). 
After this returns - `dict` is not query-ready; reload from disk to query. */ - auto output_filename = parser.get("output_filename"); - essentials::logger("building data structure (streaming save)..."); - dict.build_streaming_save(input_filename, build_config, output_filename); - essentials::logger("DONE"); + /* Always build via the streaming-save path: peak RAM is bounded by + the build phase only. If the caller didn't pass -o, write to a + tmp file in `tmp_dirname` and delete it after the build (or after + the --check verification). */ + std::string output_filename; + if (has_output) { + output_filename = parser.get("output_filename"); } else { - essentials::logger("building data structure..."); - dict.build(input_filename, build_config); + std::stringstream ss; + ss << build_config.tmp_dirname << "/sshash.tmp.run_" + << pthash::clock_type::now().time_since_epoch().count() << ".index.bin"; + output_filename = ss.str(); + } - if (check) { - check_correctness_lookup_access(dict, input_filename); - check_correctness_navigational_kmer_query(dict, input_filename); - check_correctness_navigational_string_query(dict); - if (build_config.weighted) check_correctness_weights(dict, input_filename); - check_correctness_kmer_iterator(dict); - check_correctness_string_iterator(dict); - } + { + dictionary_type dict; + essentials::logger("building data structure..."); + dict.build(input_filename, build_config, output_filename); + essentials::logger("DONE"); + } - if (has_output) { - auto output_filename = parser.get("output_filename"); - essentials::logger("saving data structure to disk..."); - essentials::save(dict, output_filename.c_str()); - essentials::logger("DONE"); - } + if (check) { + dictionary_type dict; + open_dictionary(dict, output_filename, /*mmap=*/true, build_config.verbose); + check_correctness_lookup_access(dict, input_filename); + check_correctness_navigational_kmer_query(dict, input_filename); + check_correctness_navigational_string_query(dict); + if 
(build_config.weighted) check_correctness_weights(dict, input_filename); + check_correctness_kmer_iterator(dict); + check_correctness_string_iterator(dict); } + if (!has_output) std::remove(output_filename.c_str()); + return 0; } From 8abeb704f02a5d3489e2eb04dd688d110fa07248 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Wed, 6 May 2026 19:24:32 +0200 Subject: [PATCH 29/32] minor --- include/builder/buffered_record_stream.hpp | 7 ++----- include/builder/dictionary_builder.hpp | 9 ++++----- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/include/builder/buffered_record_stream.hpp b/include/builder/buffered_record_stream.hpp index 382505c..28c72de 100644 --- a/include/builder/buffered_record_stream.hpp +++ b/include/builder/buffered_record_stream.hpp @@ -44,14 +44,11 @@ struct buffered_record_stream { /* Open `filename` for forward reading; optionally seek to byte `start_byte` before priming the read window. */ - void open(std::string const& filename, - uint64_t buffer_records = default_buffer_records, + void open(std::string const& filename, uint64_t buffer_records = default_buffer_records, std::streamoff start_byte = 0) { m_buf.resize(std::max(1, buffer_records)); m_in.open(filename, std::ifstream::binary); - if (!m_in.is_open()) { - throw std::runtime_error("cannot open file '" + filename + "'"); - } + if (!m_in.is_open()) { throw std::runtime_error("cannot open file '" + filename + "'"); } if (start_byte != 0) m_in.seekg(start_byte, std::ios::beg); m_pos = 0; m_size = 0; diff --git a/include/builder/dictionary_builder.hpp b/include/builder/dictionary_builder.hpp index 0439d7c..97de32e 100644 --- a/include/builder/dictionary_builder.hpp +++ b/include/builder/dictionary_builder.hpp @@ -65,9 +65,9 @@ struct dictionary_builder // in RAM. After this returns, `d` is *not* query-ready; reload the saved file via `essentials::load` / `essentials::mmap` to query. 
*/ - void build(dictionary& d, // - std::string const& filename, // - std::string const& output_filename) // + void build(dictionary& d, // + std::string const& filename, // + std::string const& output_filename) // { run_steps_1_through_7(d, filename); do_step("step 8 (stream-save dictionary to disk)", [&]() { @@ -215,8 +215,7 @@ struct dictionary_builder // if (num_bytes > 0) { std::cout << "total index size: " << num_bytes << " [B] -- " << essentials::convert(num_bytes, essentials::MB) << " [MB]\n"; - std::cout << " total: " - << (num_kmers > 0 ? (8.0 * num_bytes) / num_kmers : 0.0) + std::cout << " total: " << (num_kmers > 0 ? (8.0 * num_bytes) / num_kmers : 0.0) << " [bits/kmer]" << std::endl; } } From 5aff9f7c4e3a8aa8e118799078ab22d29c608a30 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 6 May 2026 17:30:27 +0000 Subject: [PATCH 30/32] docs: add build-algorithm.md describing the streaming build pipeline Walks through the eight build steps, what each one produces, and where each intermediate lives between steps. Documents the two mechanisms that together cap peak RSS at --ram-limit: * every input-size-scaling intermediate is spilled to disk, * every working buffer is sized as a fixed fraction of ram_limit, with the fractions tabulated. --- build-algorithm.md | 226 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 226 insertions(+) create mode 100644 build-algorithm.md diff --git a/build-algorithm.md b/build-algorithm.md new file mode 100644 index 0000000..992e2ff --- /dev/null +++ b/build-algorithm.md @@ -0,0 +1,226 @@ +# SSHash build algorithm + +This note describes how `sshash build` constructs a dictionary while keeping +peak resident memory bounded by the user-supplied `--ram-limit` (in GiB). + +The design has two ideas, applied uniformly: + +1. **Spill, don't accumulate.** Every intermediate that grows with the input + size is written to a tmp file under `--tmp-dirname` rather than held in a + `std::vector` / bit-vector in RAM. 
Producers append through a small write + buffer; consumers re-read through a small read buffer + (`buffered_record_stream`). +2. **Cap working buffers at a fraction of `--ram-limit`.** Buffers that live + only inside one step are sized as `ram_limit_in_GiB · GiB / N` (with `N` + typically 2 or 8). The constants are picked so that even when several + buffers are alive at the same time across overlapping steps, their sum + stays under the user budget while heap fragmentation across step + transitions is absorbed. + +The build never materializes the final index in RAM. Instead, step 8 +streams it directly to the user-supplied output file (or a tmp file, deleted +on exit, when the user did not pass `-o`). To run `--check`, `tools/build.cpp` +mmaps the saved file and runs the correctness queries against the mmap'd +dictionary. + +--- + +## Pipeline overview + +The orchestration is in `include/builder/dictionary_builder.hpp` +(`run_steps_1_through_7` + `build`). Per-step details are in +`src/builder/{encode_strings,compute_minimizer_tuples,build_sparse_and_skew_index}.cpp`. 
+ +| Step | What it produces | Where it lives between steps | +|------|------------------|------------------------------| +| 1 | Encoded `strings` bit-vector + `strings_offsets` | tmp files (`disk_backed_strings`, `disk_backed_offsets_builder`) | +| 1.1 | Compressed weights (only if `--weighted`) | `weights::builder` (in-RAM, bounded by run-length structure) | +| 2 | Per-thread sorted runs of `minimizer_tuple` | tmp files, one per flushed buffer | +| 3 | Single sorted run of all `minimizer_tuple`s | tmp file (k-way external merge) | +| 4 | Minimizers MPHF F | tmp file (spilled at end of step 5) | +| 5 | Minimizer values replaced by F(minimizer); buffers re-flushed in F-order | new sorted runs, tmp files | +| 6 | Single sorted run keyed by F(minimizer) | tmp file | +| 7.1 | Sparse-index components (`control_codewords`, `mid_load_buckets`) | tmp files | +| 7.2 | Skew-index components (`heavy_load_buckets`, per-partition MPHFs and `positions`) | tmp files | +| 8 | Final on-disk index file | streamed to output, tmp files removed | + +After step 8 the dictionary object `d` is **not** query-ready: the spilled +components were copied into the output file but never read back into `d`. +`finalize_stats` reports `index_size_in_bytes` via `std::filesystem::file_size` +on the saved path. + +--- + +## Step 1 — encode strings (`encode_strings.cpp`) + +Iterates the input FASTA, producing the 2-bit-packed `strings` bit-vector +and the `strings_offsets` array (one offset per sequence + a sentinel). +Both go through disk-backed builders: + +- **`disk_backed_strings`**: appends 2-bit characters into a small in-RAM + word buffer; flushes the buffer to a tmp file when full. +- **`disk_backed_offsets_builder`**: appends one `uint64_t` offset + per sequence into a small write buffer; flushes to a tmp file. + +In-RAM footprint of step 1 is `O(buffer)` regardless of input size. + +## Step 1.1 — weights (optional) + +Only runs with `--weighted`. 
The weights builder uses run-length encoding: +its in-RAM size is proportional to the number of distinct weights, not to +the number of k-mers. + +## Step 2 — compute minimizer tuples (`compute_minimizer_tuples.cpp`) + +Each thread streams its assigned slice of the input via the disk-backed +strings/offsets readers and emits `minimizer_tuple` records into a private +in-RAM buffer: + +```cpp +buffer_size = (ram_limit · GiB) / (2 · sizeof(minimizer_tuple) · num_threads) +``` + +When the buffer fills, the thread sorts it in parallel and flushes a sorted +run to a tmp file (`minimizers_tuples::sort_and_flush`). The factor of 2 +in the denominator leaves headroom for `std::sort`'s allocations and +inter-thread contention; the per-thread split makes the total in-RAM tuple +buffer ≈ `ram_limit / 2`. + +## Step 3 — k-way external merge (`minimizers_tuples::merge`) + +The N tmp files from step 2 are merged into a single sorted run via a +**winner-tree-based external-merge iterator** (`file_merging_iterator`). +Each input file is read through a `buffered_record_stream` +with `default_buffer_records = 4096` records, so the total in-RAM merge +state is `N · 4096 · sizeof(minimizer_tuple)` ≈ tens of MB even for very +many runs. The output is written through a small `std::ofstream` buffer. + +When N == 1 the merge degenerates to a rename + a single streaming scan to +collect bucket statistics; same RAM bound. + +## Step 4 — build minimizers MPHF + +Builds an external-memory partitioned PHF over distinct minimizers, using +pthash's `build_in_external_memory`. The minimizers are streamed from the +sorted run via `streaming_minimizers_iterator` (one buffered ifstream), +and pthash spills its own working hashes under `tmp_dirname` capped by +`mphf_build_config.ram = ram_limit / 2`. + +## Step 5 — replace minimizer values with F(minimizer) + +The merged file is re-read in fixed-size blocks; each block is hashed in +parallel and re-flushed as a new sorted run. 
Two RAM caps are combined: + +```cpp +RAM_available = ram_limit · GiB − sizeof(F) − offsets_builder.num_bytes() +buffer_unbounded = RAM_available / (3 · sizeof(minimizer_tuple)) // 3× = read+sort scratch+write +buffer_cap = (ram_limit · GiB / 8) / sizeof(minimizer_tuple) +buffer_size = min(buffer_unbounded, buffer_cap) +``` + +The `/ 8` cap exists because step 5 leaves heap pages dirtied that linger +into later steps' allocations; capping at one-eighth of the budget keeps +the cumulative RSS under `ram_limit` when steps 6/7 start allocating. + +After step 5, the minimizers MPHF F is **spilled to disk** and the in-RAM +copy is freed: subsequent steps only ever use F(minimizer) values, not F +itself. + +## Step 6 — re-merge in F-order + +Same machinery as step 3, applied to the new sorted runs from step 5. + +## Step 7.1 — sparse index (`build_sparse_and_skew_index.cpp`) + +Constructs `control_codewords` and `mid_load_buckets`. Both are produced as +on-disk `bits::compact_vector` files via `streaming_compact_vector_writer`, +so neither is ever materialized in RAM. + +## Step 7.2 — skew index + +The most RAM-sensitive step; it has three internal phases, all +disk-backed: + +- **Phase B (k-mer extraction requests).** Heavy-bucket entries become + `kmer_extraction_request` records. They are external-sorted by + `starting_pos` so that k-mer extraction reduces to a single forward + scan over `strings`. The request buffer is capped at + `ram_limit / 8 / sizeof(kmer_extraction_request)`; flushed runs are + merged with `file_merging_iterator`. +- **Per-partition kmer files.** While walking `strings` in request-sorted + order, each extracted k-mer is written to its partition's tmp file via + a buffered writer; this file is the input to the partition's MPHF. +- **Phase C (per-partition MPHF + `positions`).** For each skew partition: + 1. 
Build the partition MPHF with pthash external-memory (`ram = ram_limit / 2`, + iterator: `skew_partition_kmer_iterator` over the partition's tmp file). + 2. Stream-read the partition file again, emit `(F(kmer), pos_in_bucket)` + tuples; external-sort them in `ram_limit / 8`-sized buffers and merge. + 3. Pack the sorted tuples into the partition's `positions` + compact_vector via `streaming_compact_vector_writer`. + + Only the freshly-built MPHF for the *current* partition lives in RAM + during phase C; once spilled (`essentials::save`), it is freed before the + next partition starts. `positions` is fully on-disk. + +## Step 8 — stream-save (`include/builder/streaming_save.hpp`) + +The dictionary `d` is walked by `essentials::saver`, but every spilled +component is intercepted via an **address+type-keyed substitution map** +(`typed_address_sub`): when the saver visits a registered (address, type) +pair, it appends the bytes of the corresponding tmp file straight into the +output stream instead of reading from `d`. The strings bit-vector goes +through the same mechanism via `disk_backed_strings`. + +Concretely, the registered substitutions are: + +| Component | Source tmp file | +|-----------------------------------|----------------------------------------| +| `m_ssi.codewords.control_codewords` | step 7.1 | +| `m_ssi.mid_load_buckets` | step 7.1 | +| `m_ssi.ski.heavy_load_buckets` | step 7.2 phase B | +| `m_ssi.codewords.mphf` | step 5 spill | +| `m_ssi.ski.positions[i]` | step 7.2 phase C, per partition | +| `m_ssi.ski.mphfs[i]` | step 7.2 phase C, per partition | +| `m_spss.strings` | step 1 (`disk_backed_strings`) | + +Because the substitutions are by `(address, type)` pair, a struct's address +coinciding with its first member's address does not cause confusion. + +After step 8 returns, the tmp files are removed and `finalize_stats` reads +the saved file's size with `std::filesystem::file_size`. 
+ +--- + +## How the RAM cap is enforced — summary + +The on-disk index size grows with `num_kmers`. The build's **resident** +memory does not, because every component that scales with input size is +either: + +- **always on disk** (`strings`, `strings_offsets`, all sorted minimizer + runs, the merged minimizers file, the sparse-index compact_vectors, the + skew-index per-partition kmer/positions files, the codewords MPHF and + per-partition MPHFs), or +- **bounded by a working buffer** sized as a fraction of `ram_limit`: + + | Buffer | Cap | + |----------------------------------------|--------------------------| + | Step 2 per-thread minimizer buffer | `ram_limit / 2 / num_threads` | + | Step 5 hashing buffer | `min(ram/8, RAM_available/3)` | + | Step 7.2 kmer-request external sort | `ram_limit / 8` | + | Step 7.2 phase-C `position_tuple` sort | `ram_limit / 8` | + | pthash external-memory builds | `ram_limit / 2` (its own `--ram`) | + | Every disk-backed reader/writer | `default_buffer_records ≈ 32 KiB` | + | Every external merge front (per run) | `4096 · sizeof(T)` | + +The fractions (`/2` for the dominant per-step buffer, `/8` for buffers +that span step boundaries) are chosen so that overlapping allocations and +heap fragmentation between steps stay under `ram_limit` in practice. There +is a hard floor of `min_ram_limit_in_GiB` (enforced in +`validate_and_normalize_build_config`) below which step 4's MPHF builder +no longer has enough room to make progress. + +The result: peak RSS during the build is governed by `--ram-limit`, not by +the input size or by the on-disk index size, and the saved index is +identical (byte-for-byte) to one written by an in-RAM builder followed by +`essentials::save`. 
From 730f0ad3c1fb11ca86ea6e27b14c3f60d3c36a6d Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 6 May 2026 17:35:38 +0000 Subject: [PATCH 31/32] docs(build-algorithm): use real CLI flag names (-g, -d) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The doc referred to --ram-limit and --tmp-dirname; the actual short flags exposed by sshash build are -g (RAM limit in GiB) and -d (tmp dir). The pthash "ram" reference was a programmatic config field, not a CLI flag — clarified. --- build-algorithm.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/build-algorithm.md b/build-algorithm.md index 992e2ff..c437166 100644 --- a/build-algorithm.md +++ b/build-algorithm.md @@ -1,16 +1,16 @@ # SSHash build algorithm This note describes how `sshash build` constructs a dictionary while keeping -peak resident memory bounded by the user-supplied `--ram-limit` (in GiB). +peak resident memory bounded by the user-supplied `-g` (in GiB). The design has two ideas, applied uniformly: 1. **Spill, don't accumulate.** Every intermediate that grows with the input - size is written to a tmp file under `--tmp-dirname` rather than held in a + size is written to a tmp file under `-d` (tmp dir) rather than held in a `std::vector` / bit-vector in RAM. Producers append through a small write buffer; consumers re-read through a small read buffer (`buffered_record_stream`). -2. **Cap working buffers at a fraction of `--ram-limit`.** Buffers that live +2. **Cap working buffers at a fraction of `-g`.** Buffers that live only inside one step are sized as `ram_limit_in_GiB · GiB / N` (with `N` typically 2 or 8). 
The constants are picked so that even when several buffers are alive at the same time across overlapping steps, their sum @@ -209,7 +209,7 @@ either: | Step 5 hashing buffer | `min(ram/8, RAM_available/3)` | | Step 7.2 kmer-request external sort | `ram_limit / 8` | | Step 7.2 phase-C `position_tuple` sort | `ram_limit / 8` | - | pthash external-memory builds | `ram_limit / 2` (its own `--ram`) | + | pthash external-memory builds | `ram_limit / 2` (its own `ram` field) | | Every disk-backed reader/writer | `default_buffer_records ≈ 32 KiB` | | Every external merge front (per run) | `4096 · sizeof(T)` | @@ -220,7 +220,7 @@ is a hard floor of `min_ram_limit_in_GiB` (enforced in `validate_and_normalize_build_config`) below which step 4's MPHF builder no longer has enough room to make progress. -The result: peak RSS during the build is governed by `--ram-limit`, not by +The result: peak RSS during the build is governed by `-g`, not by the input size or by the on-disk index size, and the saved index is identical (byte-for-byte) to one written by an in-RAM builder followed by `essentials::save`. From f19ffccab892b132d90b4e7d1647455afd51cc5a Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 6 May 2026 17:45:47 +0000 Subject: [PATCH 32/32] docs(build-algorithm): rephrase 'O(buffer)' as 'proportional to the buffer size' --- build-algorithm.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/build-algorithm.md b/build-algorithm.md index c437166..7490726 100644 --- a/build-algorithm.md +++ b/build-algorithm.md @@ -62,7 +62,8 @@ Both go through disk-backed builders: - **`disk_backed_offsets_builder`**: appends one `uint64_t` offset per sequence into a small write buffer; flushes to a tmp file. -In-RAM footprint of step 1 is `O(buffer)` regardless of input size. +In-RAM footprint of step 1 is proportional to the buffer size, regardless of +input size. ## Step 1.1 — weights (optional)