From 9007363a6d53b3b4f9a8f6a9c4f48d07d9cb402a Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Fri, 10 Apr 2026 19:04:32 +0530 Subject: [PATCH 01/32] pushed old changes made when travelling in Nepal: thinner bucket_type --- include/builder/util.hpp | 39 ++++++++++----------- src/builder/build_sparse_and_skew_index.cpp | 26 ++++++++------ 2 files changed, 35 insertions(+), 30 deletions(-) diff --git a/include/builder/util.hpp b/include/builder/util.hpp index 81efc34..31888d3 100644 --- a/include/builder/util.hpp +++ b/include/builder/util.hpp @@ -60,22 +60,7 @@ inline std::ostream& operator<<(std::ostream& os, minimizer_tuple const& mt) { struct bucket_type { bucket_type(minimizer_tuple const* begin, minimizer_tuple const* end) - : m_begin(begin) - , m_end(end) - , m_num_super_kmers(std::distance(begin, end)) - , m_num_minimizer_positions(0) // - { - uint64_t prev_pos_in_seq = constants::invalid_uint64; - while (begin != end) { - uint64_t pos_in_seq = (*begin).pos_in_seq; - if (pos_in_seq != prev_pos_in_seq) { - ++m_num_minimizer_positions; - prev_pos_in_seq = pos_in_seq; - } - ++begin; - } - assert(m_num_minimizer_positions <= m_num_super_kmers); - } + : m_begin(begin), m_end(end) {} struct iterator { iterator(minimizer_tuple const* begin) : m_begin(begin) {} @@ -103,8 +88,24 @@ struct bucket_type { So the method size() returns the number of minimizer positions which is <= the number of superkmers. 
*/ - uint64_t num_super_kmers() const { return m_num_super_kmers; } - uint64_t size() const { return m_num_minimizer_positions; } + + uint64_t num_super_kmers() const { return std::distance(m_begin, m_end); } + + uint64_t size() const { + uint64_t num_minimizer_positions = 0; + uint64_t prev_pos_in_seq = constants::invalid_uint64; + auto const* begin = m_begin; + while (begin != m_end) { + uint64_t pos_in_seq = (*begin).pos_in_seq; + if (pos_in_seq != prev_pos_in_seq) { + ++num_minimizer_positions; + prev_pos_in_seq = pos_in_seq; + } + ++begin; + } + assert(num_minimizer_positions <= num_super_kmers()); + return num_minimizer_positions; + } minimizer_tuple const* begin_ptr() const { return m_begin; } minimizer_tuple const* end_ptr() const { return m_end; } @@ -112,8 +113,6 @@ struct bucket_type { private: minimizer_tuple const* m_begin; minimizer_tuple const* m_end; - uint64_t m_num_super_kmers; - uint64_t m_num_minimizer_positions; }; /* diff --git a/src/builder/build_sparse_and_skew_index.cpp b/src/builder/build_sparse_and_skew_index.cpp index 7ed9886..cae852b 100644 --- a/src/builder/build_sparse_and_skew_index.cpp +++ b/src/builder/build_sparse_and_skew_index.cpp @@ -25,7 +25,9 @@ void dictionary_builder::build_sparse_and_skew_index( uint64_t num_minimizer_positions_of_buckets_larger_than_1 = 0; uint64_t num_minimizer_positions_of_buckets_in_skew_index = 0; - // First pass: collect bucket statistics to compute tighter bound + /* + First pass: collect bucket statistics to compute tighter bound. 
+ */ for (minimizers_tuples_iterator it(input.data(), input.data() + input.size()); // it.has_next(); it.next()) // { @@ -51,10 +53,13 @@ void dictionary_builder::build_sparse_and_skew_index( assert(buckets_stats.num_buckets() == num_minimizers); - // Calculate bits needed for control codewords encoding: - // Encoding format: ((list_id << min_l) | (bucket_size - 2)) << 2 | status_code - // We need: 2 bits (status) + min_l bits (bucket_size) + bits for list_id - // list_id is bounded by the maximum number of buckets sharing the same size + /* + Calculate bits needed for control codewords encoding. + Encoding format: + ((list_id << min_l) | (bucket_size - 2)) << 2 | status_code + We need: 2 bits (status) + min_l bits (bucket_size) + bits for list_id. + list_id is bounded by the maximum number of buckets sharing the same size. + */ const uint64_t bits_for_list_id = std::ceil(std::log2(buckets_stats.max_sparse_buckets_per_size() + 1)); const uint64_t num_bits_for_control = @@ -106,7 +111,6 @@ void dictionary_builder::build_sparse_and_skew_index( const uint64_t bucket_id = it.minimizer(); auto bucket = it.bucket(); const uint64_t bucket_size = bucket.size(); - if (bucket_size == 1) { // Handle size-1 buckets: encode directly into control codewords uint64_t prev_pos_in_seq = constants::invalid_uint64; @@ -271,7 +275,8 @@ void dictionary_builder::build_sparse_and_skew_index( for (uint64_t i = buckets.size() - num_buckets_in_skew_index; i <= buckets.size(); ++i) // { auto const& bucket = buckets[i]; - while (i == buckets.size() or bucket.size() > upper) // + const uint64_t bucket_size = bucket.size(); + while (i == buckets.size() or bucket_size > upper) // { if (build_config.verbose) { std::cout << " partition = " << partition_id @@ -291,7 +296,7 @@ void dictionary_builder::build_sparse_and_skew_index( if (i == buckets.size()) break; - assert(bucket.size() > lower and bucket.size() <= upper); + assert(bucket_size > lower and bucket_size <= upper); for (auto mt : bucket) { 
num_kmers_in_partition[partition_id] += mt.num_kmers_in_super_kmer; } @@ -341,7 +346,8 @@ void dictionary_builder::build_sparse_and_skew_index( i <= buckets.size(); ++i) // { auto const& bucket = buckets[i]; - while (i == buckets.size() or bucket.size() > upper) // + const uint64_t bucket_size = bucket.size(); + while (i == buckets.size() or bucket_size > upper) // { if (build_config.verbose) { std::cout << " lower = " << lower << "; upper = " << upper @@ -441,7 +447,7 @@ void dictionary_builder::build_sparse_and_skew_index( if (i == buckets.size()) break; - assert(bucket.size() > lower and bucket.size() <= upper); + assert(bucket_size > lower and bucket_size <= upper); uint64_t pos_in_bucket = -1; uint64_t prev_pos_in_seq = constants::invalid_uint64; for (auto mt : bucket) // From 54aa00b4f18f3a0f058accbbb3741cead4ffa203 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 4 May 2026 20:32:13 +0000 Subject: [PATCH 02/32] fixed issue with minimizers_tuples_iterator Cherry-picked from 1f0cdd1 on master. 
https://claude.ai/code/session_01BShS2GDASvEsCAbgJyQVBK --- include/builder/util.hpp | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/include/builder/util.hpp b/include/builder/util.hpp index 31888d3..9b9b209 100644 --- a/include/builder/util.hpp +++ b/include/builder/util.hpp @@ -145,13 +145,10 @@ struct minimizers_tuples_iterator { minimizer_tuple const* m_end; minimizer_tuple const* next_begin() { + if (m_bucket_begin == m_end) return m_end; minimizer_tuple const* begin = m_bucket_begin; - uint64_t prev_minimizer = (*begin).minimizer; - while (begin != m_end) { - ++begin; - uint64_t curr_minimizer = (*begin).minimizer; - if (curr_minimizer != prev_minimizer) break; - } + uint64_t prev_minimizer = begin->minimizer; + while (++begin != m_end and begin->minimizer == prev_minimizer) {} return begin; } }; From 259b7fc4ab1c87a32e6bc631b46fa9afed7cb2f2 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 4 May 2026 20:32:26 +0000 Subject: [PATCH 03/32] step 7.2: stream over strings instead of random access The skew-index build used to do a random read into the strings bitvector for every super-kmer in every heavy-load bucket (buckets are visited in size-sorted order, so the resulting positions are essentially random across all of strings). That forces strings to be RAM-resident throughout step 7.2. This commit restructures step 7.2 into three sub-steps: (A) walk the heavy-load buckets and emit one kmer_extraction_request per super-kmer, externally sort+flushed by starting_pos (parallel_sort within a bounded RAM buffer, ~1/4 of --ram-limit). (B) merge the sorted runs and walk strings in a single forward pass; for each request extract the requested kmers and append (kmer.bits, pos_in_bucket) to a per-partition tmp file. Only kmer.bits is serialized to avoid persisting the vptr that uint_kmer_t carries via its virtual destructor. (C) for each partition, read its tmp file, build the MPHF and the positions compact vector. 
The skew index is assembled partition by partition. The access pattern over strings is now monotonically non-decreasing, which is the precondition for moving strings itself out of RAM in a follow-up change. Correctness verified via `sshash build --check` on salmonella_enterica (m=7), canonical mode, and salmonella_100 with 4 threads. https://claude.ai/code/session_01BShS2GDASvEsCAbgJyQVBK --- src/builder/build_sparse_and_skew_index.cpp | 419 +++++++++++++------- 1 file changed, 267 insertions(+), 152 deletions(-) diff --git a/src/builder/build_sparse_and_skew_index.cpp b/src/builder/build_sparse_and_skew_index.cpp index cae852b..1720274 100644 --- a/src/builder/build_sparse_and_skew_index.cpp +++ b/src/builder/build_sparse_and_skew_index.cpp @@ -2,6 +2,40 @@ namespace sshash { +/* + A request to extract `num_kmers_in_super_kmer` consecutive k-mers from + `strings` starting at base position `starting_pos`. After requests are + externally sorted by `starting_pos`, k-mer extraction reduces to a single + forward scan over `strings`. 
+*/ +#pragma pack(push, 4) +struct kmer_extraction_request { + kmer_extraction_request() {} + kmer_extraction_request(uint64_t starting_pos, uint32_t partition_id, + uint32_t pos_in_bucket, uint32_t num_kmers_in_super_kmer) + : starting_pos(starting_pos) + , partition_id(partition_id) + , pos_in_bucket(pos_in_bucket) + , num_kmers_in_super_kmer(num_kmers_in_super_kmer) {} + + bool operator<(kmer_extraction_request const& o) const { + return starting_pos < o.starting_pos; + } + bool operator>(kmer_extraction_request const& o) const { + return starting_pos > o.starting_pos; + } + + static kmer_extraction_request max() { + return kmer_extraction_request(uint64_t(-1), uint32_t(-1), uint32_t(-1), uint32_t(-1)); + } + + uint64_t starting_pos; + uint32_t partition_id; + uint32_t pos_in_bucket; + uint32_t num_kmers_in_super_kmer; +}; +#pragma pack(pop) + template void dictionary_builder::build_sparse_and_skew_index( dictionary& d) // @@ -262,58 +296,196 @@ void dictionary_builder::build_sparse_and_skew_index( return; } - /* step 2. build skew index */ + /* + step 2. build skew index + + We do this in three sub-steps: + (A) walk the heavy-load buckets in size-sorted order, decode each + super-kmer's absolute starting position in `strings` and emit a + `kmer_extraction_request`. Requests are sort+flushed to disk in + chunks (external sort by `starting_pos`). + (B) merge the sorted runs and walk `strings` in a single forward + sequential pass, extracting the requested k-mers. For each k-mer + we append `(kmer.bits, pos_in_bucket)` to a per-partition tmp file. + (C) for each partition, read its tmp file, build the MPHF, then build + the positions compact vector. The skew index is assembled + partition by partition. + + Avoiding the random access pattern over `strings` in (B) is the + precondition for moving `strings` itself out of RAM in a later step. 
+ */ timer.start(); + std::vector num_kmers_in_partition(num_partitions, 0); + /* unique run identifier for the tmp files produced by this step */ + const uint64_t skew_run_id = pthash::clock_type::now().time_since_epoch().count(); + auto request_run_filename = [&](uint64_t id) { + std::stringstream ss; + ss << build_config.tmp_dirname << "/sshash.tmp.run_" << skew_run_id + << ".kmer_requests." << id << ".bin"; + return ss.str(); + }; + auto skew_partition_filename = [&](uint64_t pid) { + std::stringstream ss; + ss << build_config.tmp_dirname << "/sshash.tmp.run_" << skew_run_id + << ".skew_kmers." << pid << ".bin"; + return ss.str(); + }; + + /* (A) emit kmer-extraction requests, externally sorted by `starting_pos` */ + std::atomic num_request_runs{0}; { + const uint64_t request_buffer_capacity = std::max( + uint64_t(1) << 16, + (build_config.ram_limit_in_GiB * essentials::GiB) / + (4 * sizeof(kmer_extraction_request))); + + std::vector request_buffer; + request_buffer.reserve(request_buffer_capacity); + + auto flush_request_buffer = [&]() { + if (request_buffer.empty()) return; + parallel_sort(request_buffer, build_config.num_threads, + [](kmer_extraction_request const& a, + kmer_extraction_request const& b) { + return a.starting_pos < b.starting_pos; + }); + const uint64_t id = num_request_runs.fetch_add(1); + const std::string fn = request_run_filename(id); + if (build_config.verbose) { + std::cout << "saving to file '" << fn << "'..." 
<< std::endl; + } + std::ofstream out(fn, std::ofstream::binary); + if (!out.is_open()) throw std::runtime_error("cannot open file"); + out.write(reinterpret_cast(request_buffer.data()), + request_buffer.size() * sizeof(kmer_extraction_request)); + out.close(); + request_buffer.clear(); + }; + uint64_t partition_id = 0; uint64_t lower = min_size; uint64_t upper = 2 * lower; - uint64_t num_kmers_in_skew_index = 0; - for (uint64_t i = buckets.size() - num_buckets_in_skew_index; i <= buckets.size(); ++i) // + for (uint64_t i = buckets.size() - num_buckets_in_skew_index; i < buckets.size(); ++i) // { auto const& bucket = buckets[i]; const uint64_t bucket_size = bucket.size(); - while (i == buckets.size() or bucket_size > upper) // + while (bucket_size > upper) // { - if (build_config.verbose) { - std::cout << " partition = " << partition_id - << ": num kmers in buckets of size > " << lower << " and <= " << upper - << ": " << num_kmers_in_partition[partition_id] << std::endl; - } - - num_kmers_in_skew_index += num_kmers_in_partition[partition_id]; - - if (i == buckets.size()) break; - lower = upper; upper = 2 * lower; partition_id += 1; if (partition_id == num_partitions - 1) upper = max_bucket_size; } - - if (i == buckets.size()) break; - assert(bucket_size > lower and bucket_size <= upper); - for (auto mt : bucket) { + assert(partition_id < num_partitions); + + uint32_t pos_in_bucket = uint32_t(-1); + uint64_t prev_pos_in_seq = constants::invalid_uint64; + for (auto mt : bucket) // + { num_kmers_in_partition[partition_id] += mt.num_kmers_in_super_kmer; + if (mt.pos_in_seq != prev_pos_in_seq) { + prev_pos_in_seq = mt.pos_in_seq; + ++pos_in_bucket; + } + assert(mt.pos_in_seq >= mt.pos_in_kmer); + const uint64_t abs_offset = + d.m_spss.strings_offsets.decode(mt.pos_in_seq).absolute_offset; + const uint64_t starting_pos = abs_offset - mt.pos_in_kmer; + if (request_buffer.size() == request_buffer_capacity) flush_request_buffer(); + request_buffer.emplace_back(starting_pos, 
// + uint32_t(partition_id), // + pos_in_bucket, // + uint32_t(mt.num_kmers_in_super_kmer)); // } } + flush_request_buffer(); assert(partition_id == num_partitions - 1); + } + + if (build_config.verbose) { + uint64_t total_kmers_in_skew = 0; + for (uint64_t p = 0; p != num_partitions; ++p) { + total_kmers_in_skew += num_kmers_in_partition[p]; + std::cout << " partition = " << p + << ": num kmers in partition = " << num_kmers_in_partition[p] << std::endl; + } + std::cout << "num kmers in skew index = " << total_kmers_in_skew << " (" + << (total_kmers_in_skew * 100.0) / buckets_stats.num_kmers() << "%)" << std::endl; + } - if (build_config.verbose) { - std::cout << "num kmers in skew index = " << num_kmers_in_skew_index << " (" - << (num_kmers_in_skew_index * 100.0) / buckets_stats.num_kmers() << "%)" - << std::endl; + /* (B) sequential extraction over `strings` -> per-partition kmer tmp files */ + { + struct request_run_names_iterator { + request_run_names_iterator(std::string const& tmp_dirname, uint64_t skew_run_id) + : i(0), skew_run_id(skew_run_id), tmp_dirname(tmp_dirname) {} + + std::string operator*() const { + std::stringstream ss; + ss << tmp_dirname << "/sshash.tmp.run_" << skew_run_id + << ".kmer_requests." 
<< i << ".bin"; + return ss.str(); + } + void operator++() { ++i; } + + uint64_t i; + uint64_t skew_run_id; + std::string tmp_dirname; + }; + + request_run_names_iterator names_it(build_config.tmp_dirname, skew_run_id); + file_merging_iterator merger(names_it, num_request_runs.load()); + + std::vector partition_writers(num_partitions); + for (uint64_t p = 0; p != num_partitions; ++p) { + if (num_kmers_in_partition[p] == 0) continue; + partition_writers[p].open(skew_partition_filename(p), + std::ofstream::binary | std::ofstream::trunc); + if (!partition_writers[p].is_open()) { + throw std::runtime_error("cannot open skew-partition tmp file"); + } } - assert(num_kmers_in_skew_index == std::accumulate(num_kmers_in_partition.begin(), - num_kmers_in_partition.end(), - uint64_t(0))); + const uint64_t k = build_config.k; + const bool canonical = build_config.canonical; + kmer_iterator kmer_it(d.m_spss.strings, k); + + while (merger.has_next()) // + { + const kmer_extraction_request req = *merger; + kmer_it.at(Kmer::bits_per_char * req.starting_pos); + for (uint32_t i = 0; i != req.num_kmers_in_super_kmer; ++i) { + Kmer kmer = kmer_it.get(); + if (canonical) { + Kmer kmer_rc = kmer; + kmer_rc.reverse_complement_inplace(k); + kmer = std::min(kmer, kmer_rc); + } + auto& w = partition_writers[req.partition_id]; + /* write only `kmer.bits` (avoids serializing the vptr that + `uint_kmer_t` carries due to its virtual destructor) */ + w.write(reinterpret_cast(&kmer.bits), sizeof(kmer.bits)); + w.write(reinterpret_cast(&req.pos_in_bucket), + sizeof(req.pos_in_bucket)); + kmer_it.next(); + } + merger.next(); + } + merger.close(); + + for (auto& w : partition_writers) { + if (w.is_open()) w.close(); + } + + for (uint64_t i = 0; i != num_request_runs.load(); ++i) { + std::remove(request_run_filename(i).c_str()); + } } + /* (C) per-partition MPHF + positions build */ { std::vector> mphfs; std::vector positions; @@ -329,155 +501,98 @@ void 
dictionary_builder::build_sparse_and_skew_index( mphf_build_config.num_threads = build_config.num_threads; mphf_build_config.avg_partition_size = constants::avg_partition_size; - uint64_t partition_id = 0; uint64_t lower = min_size; uint64_t upper = 2 * lower; uint64_t num_bits_per_pos = constants::min_l + 1; + if (num_partitions == 1) { + upper = max_bucket_size; + num_bits_per_pos = log2_max_bucket_size; + } - /* Temporary storage for kmers and positions within a partition. */ - std::vector kmers; - std::vector positions_in_bucket; - bits::compact_vector::builder cvb_positions; - kmers.reserve(num_kmers_in_partition[partition_id]); - positions_in_bucket.reserve(num_kmers_in_partition[partition_id]); - cvb_positions.resize(num_kmers_in_partition[partition_id], num_bits_per_pos); - - for (uint64_t i = buckets.size() - num_buckets_in_skew_index, k = build_config.k; - i <= buckets.size(); ++i) // + for (uint64_t partition_id = 0; partition_id != num_partitions; ++partition_id) // { - auto const& bucket = buckets[i]; - const uint64_t bucket_size = bucket.size(); - while (i == buckets.size() or bucket_size > upper) // - { - if (build_config.verbose) { - std::cout << " lower = " << lower << "; upper = " << upper - << "; num_bits_per_pos = " << num_bits_per_pos - << "; num_kmers_in_partition = " << kmers.size() << std::endl; - } - assert(num_kmers_in_partition[partition_id] == kmers.size()); - assert(num_kmers_in_partition[partition_id] == positions_in_bucket.size()); - - if (num_kmers_in_partition[partition_id] > 0) // - { - /*******/ - // { - // uint64_t RAM_available_in_bytes = essentials::GiB; - - // uint64_t RAM_taken_in_bytes = essentials::vec_bytes(buckets) + - // essentials::vec_bytes(tuples) + - - // essentials::vec_bytes(kmers) + - // essentials::vec_bytes(positions_in_bucket) - // + - // essentials::vec_bytes(cvb_positions.data()) - // + - - // d.num_bits() / 8; // current memory - - // std::cout << "RAM_taken_in_bytes = " << RAM_taken_in_bytes << std::endl; - 
- // const uint64_t RAM_limit_in_bytes = - // build_config.ram_limit_in_GiB * essentials::GiB; - - // if (RAM_limit_in_bytes > RAM_taken_in_bytes) { - // RAM_available_in_bytes = std::max( - // RAM_limit_in_bytes - RAM_taken_in_bytes, RAM_available_in_bytes); - // } - // std::cout << "RAM_available_in_bytes = " << RAM_available_in_bytes - // << std::endl; - - // mphf_build_config.ram = RAM_available_in_bytes / 2; // at least 0.5 GB - // } - /*******/ - - if (build_config.verbose) { - const uint64_t avg_partition_size = - pthash::compute_avg_partition_size(kmers.size(), mphf_build_config); - const uint64_t num_partitions = - pthash::compute_num_partitions(kmers.size(), avg_partition_size); - assert(num_partitions > 0); - std::cout << " building MPHF with " << mphf_build_config.num_threads - << " threads and " << num_partitions - << " partitions (avg. partition size = " << avg_partition_size - << ")..." << std::endl; - } + const uint64_t n = num_kmers_in_partition[partition_id]; - auto& F = mphfs[partition_id]; - F.build_in_internal_memory(kmers.begin(), kmers.size(), mphf_build_config); + if (build_config.verbose) { + std::cout << " lower = " << lower << "; upper = " << upper + << "; num_bits_per_pos = " << num_bits_per_pos + << "; num_kmers_in_partition = " << n << std::endl; + } - if (build_config.verbose) { - std::cout << " built mphs[" << partition_id << "] for " << kmers.size() - << " kmers; bits/key = " - << static_cast(F.num_bits()) / F.num_keys() << std::endl; - } + if (n > 0) // + { + std::vector kmers; + std::vector positions_in_bucket; + kmers.reserve(n); + positions_in_bucket.reserve(n); - for (uint64_t i = 0; i != kmers.size(); ++i) { - Kmer kmer = kmers[i]; - uint64_t pos = F(kmer); - uint32_t pos_in_bucket = positions_in_bucket[i]; - cvb_positions.set(pos, pos_in_bucket); + { + const std::string fn = skew_partition_filename(partition_id); + std::ifstream in(fn, std::ifstream::binary); + if (!in.is_open()) { + throw std::runtime_error("cannot open 
skew-partition tmp file"); } - auto& P = positions[partition_id]; - cvb_positions.build(P); - - if (build_config.verbose) { - std::cout << " built positions[" << partition_id << "] for " << P.size() - << " kmers; bits/key = " << (P.num_bytes() * 8.0) / P.size() - << std::endl; + for (uint64_t i = 0; i != n; ++i) { + Kmer kmer; + in.read(reinterpret_cast(&kmer.bits), sizeof(kmer.bits)); + uint32_t pib; + in.read(reinterpret_cast(&pib), sizeof(pib)); + kmers.push_back(kmer); + positions_in_bucket.push_back(pib); } + in.close(); + std::remove(fn.c_str()); } - if (i == buckets.size()) break; + bits::compact_vector::builder cvb_positions; + cvb_positions.resize(n, num_bits_per_pos); - lower = upper; - upper = 2 * lower; - num_bits_per_pos += 1; - partition_id += 1; - if (partition_id == num_partitions - 1) { - upper = max_bucket_size; - num_bits_per_pos = log2_max_bucket_size; + if (build_config.verbose) { + const uint64_t avg_partition_size = + pthash::compute_avg_partition_size(kmers.size(), mphf_build_config); + const uint64_t pthash_num_partitions = + pthash::compute_num_partitions(kmers.size(), avg_partition_size); + assert(pthash_num_partitions > 0); + std::cout << " building MPHF with " << mphf_build_config.num_threads + << " threads and " << pthash_num_partitions + << " partitions (avg. partition size = " << avg_partition_size + << ")..." 
<< std::endl; } - kmers.clear(); - positions_in_bucket.clear(); - kmers.reserve(num_kmers_in_partition[partition_id]); - positions_in_bucket.reserve(num_kmers_in_partition[partition_id]); - cvb_positions.resize(num_kmers_in_partition[partition_id], num_bits_per_pos); - } + auto& F = mphfs[partition_id]; + F.build_in_internal_memory(kmers.begin(), kmers.size(), mphf_build_config); - if (i == buckets.size()) break; + if (build_config.verbose) { + std::cout << " built mphs[" << partition_id << "] for " << kmers.size() + << " kmers; bits/key = " + << static_cast(F.num_bits()) / F.num_keys() << std::endl; + } - assert(bucket_size > lower and bucket_size <= upper); - uint64_t pos_in_bucket = -1; - uint64_t prev_pos_in_seq = constants::invalid_uint64; - for (auto mt : bucket) // - { - if (mt.pos_in_seq != prev_pos_in_seq) { - prev_pos_in_seq = mt.pos_in_seq; - ++pos_in_bucket; + for (uint64_t i = 0; i != kmers.size(); ++i) { + Kmer kmer = kmers[i]; + uint64_t pos = F(kmer); + uint32_t pos_in_bucket = positions_in_bucket[i]; + cvb_positions.set(pos, pos_in_bucket); } - assert(mt.pos_in_seq >= mt.pos_in_kmer); + auto& P = positions[partition_id]; + cvb_positions.build(P); - mt.pos_in_seq = d.m_spss.strings_offsets.decode(mt.pos_in_seq).absolute_offset; - - const uint64_t starting_pos_of_super_kmer = mt.pos_in_seq - mt.pos_in_kmer; - kmer_iterator it( - d.m_spss.strings, k, Kmer::bits_per_char * starting_pos_of_super_kmer); - for (uint64_t i = 0; i != mt.num_kmers_in_super_kmer; ++i) { - auto kmer = it.get(); - if (build_config.canonical) { /* take the canonical kmer */ - auto kmer_rc = kmer; - kmer_rc.reverse_complement_inplace(k); - kmer = std::min(kmer, kmer_rc); - } - kmers.push_back(kmer); - positions_in_bucket.push_back(pos_in_bucket); - it.next(); + if (build_config.verbose) { + std::cout << " built positions[" << partition_id << "] for " << P.size() + << " kmers; bits/key = " << (P.num_bytes() * 8.0) / P.size() + << std::endl; } - assert(pos_in_bucket < (1ULL << 
cvb_positions.width())); + } + + /* advance partition state for the next iteration */ + lower = upper; + upper = 2 * lower; + num_bits_per_pos += 1; + if (partition_id + 1 == num_partitions - 1) { + upper = max_bucket_size; + num_bits_per_pos = log2_max_bucket_size; } } - assert(partition_id == num_partitions - 1); d.m_ssi.ski.mphfs = std::move(mphfs); d.m_ssi.ski.positions = std::move(positions); From 63ea2bdae878f0aa42c1bab7a861d1775d4abcc1 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 4 May 2026 20:48:42 +0000 Subject: [PATCH 04/32] strings: stream to disk during build, read via small windows The strings bit-vector is the largest in-RAM structure during construction. This commit moves it onto disk for the build phase, replacing bits::bit_vector::builder with a new disk_backed_strings storage: - append_bits() during step 1 (encode_strings) writes completed words to a tmp file, keeping only a small write window in RAM (~512 KiB by default). - After freeze(), make_reader() returns a forward-monotonic reader over the file with a small read window (~512 KiB). Each reader owns its own ifstream so multiple threads can read concurrently without contention on the writer. - The reader exposes get_word64(pos) const, matching the interface that kmer_iterator expects. Wiring: - compute_minimizer_tuples (step 2): each thread instantiates its own reader; per-thread RSS contribution is bounded by its window size, not by the strings size. - build_sparse_and_skew_index (step 7.1): the in-RAM d.m_spss.strings is no longer populated here; step 7.2 phase (B) reads via a reader instead. - A new step 8 materializes d.m_spss.strings from the on-disk file, immediately before the standard essentials::save path. This brings strings briefly back into RAM at the very end of the build; eliminating that final peak requires a streaming save path, which is a separate change. Cleanup: the dictionary_builder destructor removes the strings tmp file if it still exists (covers exception paths). 
The hash in step 5's RAM_taken_in_bytes calculation no longer counts strings (its in-RAM footprint is now just the writer window). Verified via `sshash build --check` on: - salmonella_enterica m=7 (heavy skew index, regular and canonical), - salmonella_100 m=11 -t 4 (multi-thread, 13M kmers). Save -> load -> query roundtrip on the same dataset matches 100% on the canonical query set. https://claude.ai/code/session_01BShS2GDASvEsCAbgJyQVBK --- include/builder/dictionary_builder.hpp | 35 ++- include/builder/disk_backed_strings.hpp | 282 ++++++++++++++++++++ src/builder/build_sparse_and_skew_index.cpp | 7 +- src/builder/compute_minimizer_tuples.cpp | 3 +- 4 files changed, 320 insertions(+), 7 deletions(-) create mode 100644 include/builder/disk_backed_strings.hpp diff --git a/include/builder/dictionary_builder.hpp b/include/builder/dictionary_builder.hpp index dea89a5..77e2c17 100644 --- a/include/builder/dictionary_builder.hpp +++ b/include/builder/dictionary_builder.hpp @@ -4,6 +4,7 @@ #include "include/dictionary.hpp" #include "include/offsets.hpp" #include "include/builder/util.hpp" +#include "include/builder/disk_backed_strings.hpp" #include "include/buckets_statistics.hpp" namespace sshash { @@ -12,7 +13,13 @@ template struct dictionary_builder // { dictionary_builder(build_configuration const& build_config) - : build_config(build_config), num_kmers(0), minimizers(build_config), total_time_musec(0) {} + : build_config(build_config) + , num_kmers(0) + , minimizers(build_config) + , strings_run_id(pthash::clock_type::now().time_since_epoch().count()) + , total_time_musec(0) {} + + ~dictionary_builder() { strings_builder.remove_file(); } void build(dictionary& d, std::string const& filename) // { @@ -32,8 +39,16 @@ struct dictionary_builder // total_time_musec = 0; + { + std::stringstream ss; + ss << build_config.tmp_dirname << "/sshash.tmp.run_" << strings_run_id + << ".strings.bin"; + strings_builder.open_for_writing(ss.str()); + } + do_step("step 1 (encode 
strings)", [&]() { encode_strings(filename); + strings_builder.freeze(); d.m_num_kmers = num_kmers; assert(strings_offsets_builder.size() >= 2); d.m_num_strings = strings_offsets_builder.size() - 1; @@ -66,6 +81,14 @@ struct dictionary_builder // assert(strings_offsets_builder.size() == 0); }); + /* The build above keeps `strings` exclusively on disk (accessed via + `disk_backed_strings::reader` windows). Materialize the in-RAM + bit_vector now for the standard `essentials::save` path. */ + do_step("step 8 (materialize strings to RAM)", [&]() { + strings_builder.load_into(d.m_spss.strings); + strings_builder.remove_file(); + }); + if (build_config.verbose) { print_time(total_time_musec, "total time"); d.print_space_breakdown(); @@ -82,9 +105,11 @@ struct dictionary_builder // uint64_t num_kmers; minimizers_tuples minimizers; typename Offsets::builder strings_offsets_builder; - bits::bit_vector::builder strings_builder; + disk_backed_strings strings_builder; weights::builder weights_builder; + uint64_t strings_run_id; + essentials::timer_type timer; essentials::json_lines build_stats; uint64_t total_time_musec; @@ -134,8 +159,10 @@ struct dictionary_builder // uint64_t RAM_available_in_bytes = essentials::GiB / 2; // at least 0.5 GB { - const uint64_t RAM_taken_in_bytes = (f.num_bits() + strings_builder.num_bits()) / 8 + - strings_offsets_builder.num_bytes(); + /* `strings_builder` is now disk-backed; its in-RAM footprint is + bounded by its window size, not by the strings size. 
*/ + const uint64_t RAM_taken_in_bytes = + f.num_bits() / 8 + strings_offsets_builder.num_bytes(); const uint64_t RAM_limit_in_bytes = build_config.ram_limit_in_GiB * essentials::GiB; if (RAM_limit_in_bytes > RAM_taken_in_bytes) { RAM_available_in_bytes = std::max(RAM_limit_in_bytes - RAM_taken_in_bytes, diff --git a/include/builder/disk_backed_strings.hpp b/include/builder/disk_backed_strings.hpp new file mode 100644 index 0000000..7ec7e46 --- /dev/null +++ b/include/builder/disk_backed_strings.hpp @@ -0,0 +1,282 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "external/pthash/external/bits/include/bit_vector.hpp" + +namespace sshash { + +/* + Disk-backed storage for the SSHash `strings` bit-vector. + + During step 1 (encode_strings) bits are appended via `append_bits`. + Internally only the trailing words are kept in RAM (a small "write + window"); completed words are flushed to a tmp file. RAM usage of the + writer is bounded by the window size, independently of the total + bit-vector size. + + After `freeze()`, callers create one or more `reader`s. Each reader owns + an ifstream and a small in-RAM read window, and supports + forward-monotonic `get_word64(bit_pos)` reads. The reader matches the + interface that `kmer_iterator` expects. + + For the standard dictionary save path, `load_into(bits::bit_vector&)` + materializes the full bit-vector in RAM (peaks briefly at strings size). 
+*/ +struct disk_backed_strings { + static constexpr uint64_t default_writer_buffer_words = uint64_t(1) << 16; // 512 KiB + static constexpr uint64_t default_reader_window_words = uint64_t(1) << 16; // 512 KiB + + disk_backed_strings() + : m_num_bits(0) + , m_writer_buffer_words(default_writer_buffer_words) + , m_words_on_disk(0) + , m_frozen(false) {} + + disk_backed_strings(disk_backed_strings const&) = delete; + disk_backed_strings& operator=(disk_backed_strings const&) = delete; + + /* Open `filename` for writing; truncates any existing contents. */ + void open_for_writing(std::string const& filename, + uint64_t writer_buffer_words = default_writer_buffer_words) { + m_filename = filename; + m_writer_buffer_words = std::max(2, writer_buffer_words); + m_num_bits = 0; + m_words_on_disk = 0; + m_frozen = false; + m_buf.clear(); + m_buf.reserve(m_writer_buffer_words); + m_writer.open(m_filename, std::ofstream::binary | std::ofstream::trunc); + if (!m_writer.is_open()) { + throw std::runtime_error("cannot open strings tmp file '" + m_filename + "'"); + } + } + + /* No-op: kept for source-compatibility with bits::bit_vector::builder. */ + void reserve(uint64_t /*num_bits*/) {} + + /* Append `len` bits (`len` <= 64) from `bits`. Same semantics as + bits::bit_vector::builder::append_bits. */ + void append_bits(uint64_t bits, uint64_t len) { + assert(len <= 64); + assert(len == 64 || (bits >> len) == 0); + if (!len) return; + const uint64_t pos_in_word = m_num_bits & 63; + m_num_bits += len; + if (pos_in_word == 0) { + m_buf.push_back(bits); + } else { + m_buf.back() |= bits << pos_in_word; + if (len > 64 - pos_in_word) m_buf.push_back(bits >> (64 - pos_in_word)); + } + if (m_buf.size() > m_writer_buffer_words) flush_completed_words(); + } + + /* Flush any remaining buffered words and close the writer. After this, + the file is ready for `make_reader()` and `load_into()`. 
*/ + void freeze() { + if (m_frozen) return; + if (!m_buf.empty()) { + m_writer.write(reinterpret_cast(m_buf.data()), + m_buf.size() * sizeof(uint64_t)); + m_words_on_disk += m_buf.size(); + m_buf.clear(); + m_buf.shrink_to_fit(); + } + m_writer.close(); + m_frozen = true; + } + + uint64_t num_bits() const { return m_num_bits; } + std::string const& filename() const { return m_filename; } + bool frozen() const { return m_frozen; } + + /* + Forward-monotonic reader over the strings file. + + `get_word64(bit_pos)` returns the 64-bit word starting at bit + position `bit_pos`. Successive calls must satisfy a forward-monotonic + access pattern in word units (calling code may seek forward via + `at()`-style calls in `kmer_iterator`, but never backward). Reads + past the end-of-file are returned as zero (matches the sentinel + zero-padding the SSHash builder writes at the tail of `strings`). + */ + struct reader { + reader() = default; + reader(reader&& other) noexcept { move_from(std::move(other)); } + reader& operator=(reader&& other) noexcept { + if (this != &other) { + close(); + move_from(std::move(other)); + } + return *this; + } + reader(reader const&) = delete; + reader& operator=(reader const&) = delete; + ~reader() { close(); } + + void open(std::string const& filename, uint64_t num_bits, + uint64_t window_capacity_words = default_reader_window_words) { + m_num_bits = num_bits; + m_total_words = (num_bits + 63) / 64; + m_window_capacity = std::max(2, window_capacity_words); + m_window.assign(m_window_capacity, 0); + m_window_size = 0; + m_window_start_word = 0; + m_in.open(filename, std::ifstream::binary); + if (!m_in.is_open()) { + throw std::runtime_error("cannot open strings tmp file '" + filename + "'"); + } + seek_window_to(0); + } + + bool is_open() const { return m_in.is_open(); } + + void close() { + if (m_in.is_open()) m_in.close(); + m_window.clear(); + m_window.shrink_to_fit(); + m_window_size = 0; + m_window_start_word = 0; + } + + uint64_t 
num_bits() const { return m_num_bits; } + + uint64_t get_word64(uint64_t bit_pos) const { + const uint64_t block = bit_pos >> 6; + const uint64_t shift = bit_pos & 63; + ensure_window_covers(block); + uint64_t a = (block >= m_window_start_word && + block < m_window_start_word + m_window_size) + ? m_window[block - m_window_start_word] + : uint64_t(0); + uint64_t word = a >> shift; + if (shift) { + const uint64_t next = block + 1; + uint64_t b = (next >= m_window_start_word && + next < m_window_start_word + m_window_size) + ? m_window[next - m_window_start_word] + : uint64_t(0); + word |= b << (64 - shift); + } + return word; + } + + private: + mutable std::ifstream m_in; + uint64_t m_num_bits = 0; + uint64_t m_total_words = 0; + mutable std::vector m_window; + uint64_t m_window_capacity = 0; + mutable uint64_t m_window_size = 0; + mutable uint64_t m_window_start_word = 0; + + void seek_window_to(uint64_t target_word) const { + m_window_start_word = target_word; + if (target_word >= m_total_words) { + m_window_size = 0; + return; + } + m_in.clear(); // clear any prior eof + m_in.seekg(static_cast(target_word * sizeof(uint64_t)), + std::ios::beg); + const uint64_t to_read = std::min(m_window_capacity, m_total_words - target_word); + m_in.read(reinterpret_cast(m_window.data()), + static_cast(to_read * sizeof(uint64_t))); + const std::streamsize nread = m_in.gcount(); + m_window_size = static_cast(nread) / sizeof(uint64_t); + } + + void ensure_window_covers(uint64_t block) const { + // We may need both `block` and `block + 1` (for cross-word shifts). + // The window covers [m_window_start_word, m_window_start_word + m_window_size). + const uint64_t need_end = block + 2; // exclusive + if (block >= m_window_start_word && need_end <= m_window_start_word + m_window_size) { + return; + } + // Slide forward (backward seeks are not supported). 
+ seek_window_to(block); + } + + void move_from(reader&& other) { + m_in = std::move(other.m_in); + m_num_bits = other.m_num_bits; + m_total_words = other.m_total_words; + m_window = std::move(other.m_window); + m_window_capacity = other.m_window_capacity; + m_window_size = other.m_window_size; + m_window_start_word = other.m_window_start_word; + other.m_num_bits = 0; + other.m_total_words = 0; + other.m_window_capacity = 0; + other.m_window_size = 0; + other.m_window_start_word = 0; + } + }; + + /* Create a new reader over the frozen file. */ + reader make_reader(uint64_t window_capacity_words = default_reader_window_words) const { + if (!m_frozen) { + throw std::runtime_error("disk_backed_strings: must freeze() before make_reader()"); + } + reader r; + r.open(m_filename, m_num_bits, window_capacity_words); + return r; + } + + /* + Materialize the full bit-vector in RAM. This briefly peaks at the + bit-vector size and is used immediately before `essentials::save`. + */ + void load_into(bits::bit_vector& bv) const { + if (!m_frozen) { + throw std::runtime_error("disk_backed_strings: must freeze() before load_into()"); + } + bits::bit_vector::builder b(m_num_bits); + auto& data_vec = b.data(); + const uint64_t total_words = (m_num_bits + 63) / 64; + if (total_words > 0) { + std::ifstream in(m_filename, std::ifstream::binary); + if (!in.is_open()) { + throw std::runtime_error("cannot open strings tmp file '" + m_filename + "'"); + } + in.read(reinterpret_cast(data_vec.data()), + static_cast(total_words * sizeof(uint64_t))); + in.close(); + } + b.build(bv); + } + + /* Delete the on-disk strings file. 
*/ + void remove_file() { + if (!m_filename.empty()) std::remove(m_filename.c_str()); + } + +private: + std::string m_filename; + std::ofstream m_writer; + uint64_t m_num_bits; + std::vector m_buf; + uint64_t m_writer_buffer_words; + uint64_t m_words_on_disk; + bool m_frozen; + + void flush_completed_words() { + if (m_buf.size() < 2) return; + const uint64_t to_flush = m_buf.size() - 1; // keep last (possibly partial) word + m_writer.write(reinterpret_cast(m_buf.data()), + static_cast(to_flush * sizeof(uint64_t))); + m_words_on_disk += to_flush; + m_buf[0] = m_buf.back(); + m_buf.resize(1); + } +}; + +} // namespace sshash diff --git a/src/builder/build_sparse_and_skew_index.cpp b/src/builder/build_sparse_and_skew_index.cpp index 1720274..9d8551b 100644 --- a/src/builder/build_sparse_and_skew_index.cpp +++ b/src/builder/build_sparse_and_skew_index.cpp @@ -110,7 +110,9 @@ void dictionary_builder::build_sparse_and_skew_index( control_codewords_builder.resize(num_minimizers, num_bits_for_control); strings_offsets_builder.build(d.m_spss.strings_offsets); - strings_builder.build(d.m_spss.strings); + /* `d.m_spss.strings` is materialized later, in step 8, from the on-disk + strings tmp file owned by `strings_builder`. Step 7.2 phase (B) reads + directly from the file via a `disk_backed_strings::reader` window. */ /* step 1. 
build sparse index */ assert(buckets_stats.num_buckets() == num_minimizers); @@ -451,7 +453,8 @@ void dictionary_builder::build_sparse_and_skew_index( const uint64_t k = build_config.k; const bool canonical = build_config.canonical; - kmer_iterator kmer_it(d.m_spss.strings, k); + auto strings_reader = strings_builder.make_reader(); + kmer_iterator kmer_it(strings_reader, k); while (merger.has_next()) // { diff --git a/src/builder/compute_minimizer_tuples.cpp b/src/builder/compute_minimizer_tuples.cpp index 94916d6..8458857 100644 --- a/src/builder/compute_minimizer_tuples.cpp +++ b/src/builder/compute_minimizer_tuples.cpp @@ -47,7 +47,8 @@ void dictionary_builder::compute_minimizer_tuples() // const uint64_t index_end = std::min(index_begin + num_sequences_per_thread, num_sequences); - kmer_iterator kmer_it(strings_builder, k); + auto strings_reader = strings_builder.make_reader(); + kmer_iterator kmer_it(strings_reader, k); hasher_type hasher(build_config.seed); minimizer_iterator minimizer_it(k, m, hasher); minimizer_iterator_rc minimizer_it_rc(k, m, hasher); From 54e98ebe0f52a439965686bd5ed4f05d04e721ab Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 4 May 2026 21:01:32 +0000 Subject: [PATCH 05/32] streaming dictionary save (no full strings in RAM at any point) Introduces a streaming-save build path that writes the entire dictionary to disk without ever materializing the strings bit-vector in RAM. Combined with the disk-backed strings storage from the previous commit, this means the strings live exclusively on disk for the lifetime of the build, eliminating the final RAM peak that step 8 previously reintroduced just before essentials::save. Mechanism: - disk_backed_strings::save_to(ostream&) emits the same byte layout as bits::bit_vector's serialization (uint64_t m_num_bits, size_t n, then n*8 bytes), reading the words from the tmp file in 64 KiB chunks. 
- streaming_strings_saver wraps essentials::generic_saver and overrides visit() for bits::bit_vector: when the visited instance matches a known address (d.m_spss.strings), the streaming serializer is invoked; everything else goes through the normal essentials path. Address matching avoids introducing a marker type into bits::bit_vector. API: - dictionary_builder::build() (existing) materializes strings in RAM at the end so the dictionary is query-ready (--check, etc.). - dictionary_builder::build_streaming_save() runs steps 1-7 and stream-saves directly to the output file, leaving d.m_spss.strings empty. The dictionary is *not* query-ready after this; reload from disk to query. - dictionary::build_streaming_save() exposes the new flow. tools/build.cpp uses the streaming-save path automatically when -o is given without --check; otherwise it falls back to the materializing path so --check can run queries against d. Verified by building the same input via both paths and diff'ing the output file: byte-identical for regular, --canonical, --weighted, and 4-thread builds. Loading the streaming-saved file via essentials::load and running the standalone `sshash check` and `sshash query` tools returns "EVERYTHING OK!" and 100% positive matches. 
https://claude.ai/code/session_01BShS2GDASvEsCAbgJyQVBK --- include/builder/dictionary_builder.hpp | 78 ++++++++++++------ include/builder/disk_backed_strings.hpp | 46 +++++++++++ include/builder/streaming_save.hpp | 101 ++++++++++++++++++++++++ include/dictionary.hpp | 13 ++- src/builder/build.cpp | 43 +++++++--- tools/build.cpp | 43 ++++++---- 6 files changed, 271 insertions(+), 53 deletions(-) create mode 100644 include/builder/streaming_save.hpp diff --git a/include/builder/dictionary_builder.hpp b/include/builder/dictionary_builder.hpp index 77e2c17..3c4c5fe 100644 --- a/include/builder/dictionary_builder.hpp +++ b/include/builder/dictionary_builder.hpp @@ -5,6 +5,7 @@ #include "include/offsets.hpp" #include "include/builder/util.hpp" #include "include/builder/disk_backed_strings.hpp" +#include "include/builder/streaming_save.hpp" #include "include/buckets_statistics.hpp" namespace sshash { @@ -21,8 +22,55 @@ struct dictionary_builder // ~dictionary_builder() { strings_builder.remove_file(); } - void build(dictionary& d, std::string const& filename) // + /* + Build a query-ready dictionary in `d`. After this returns, + `d.m_spss.strings` is materialized in RAM (peak briefly equals the + strings size). Use this when the caller needs to query `d` post-build + (e.g., `--check`). + */ + void build(dictionary& d, std::string const& filename) { + run_steps_1_through_7(d, filename); + do_step("step 8 (materialize strings to RAM)", [&]() { + strings_builder.load_into(d.m_spss.strings); + strings_builder.remove_file(); + }); + finalize_stats(d); + } + + /* + Build the dictionary and stream-save it to `output_filename` without + ever materializing `strings` in RAM. After this returns, `d` is *not* + query-ready (`d.m_spss.strings` is empty). Use this when the caller + only needs the on-disk index file and wants to keep peak RAM bounded + by the build phase. 
+ */ + void build_streaming_save(dictionary& d, // + std::string const& filename, // + std::string const& output_filename) // { + run_steps_1_through_7(d, filename); + do_step("step 8 (stream-save dictionary to disk)", [&]() { + save_streaming(d, output_filename.c_str(), &d.m_spss.strings, strings_builder); + strings_builder.remove_file(); + }); + finalize_stats(d); + } + + build_configuration build_config; + uint64_t num_kmers; + minimizers_tuples minimizers; + typename Offsets::builder strings_offsets_builder; + disk_backed_strings strings_builder; + weights::builder weights_builder; + + uint64_t strings_run_id; + + essentials::timer_type timer; + essentials::json_lines build_stats; + uint64_t total_time_musec; + +private: + void run_steps_1_through_7(dictionary& d, std::string const& filename) { d.m_k = build_config.k; d.m_m = build_config.m; d.m_spss.k = build_config.k; @@ -80,18 +128,14 @@ struct dictionary_builder // minimizers.remove_tmp_file(); assert(strings_offsets_builder.size() == 0); }); + } - /* The build above keeps `strings` exclusively on disk (accessed via - `disk_backed_strings::reader` windows). Materialize the in-RAM - bit_vector now for the standard `essentials::save` path. */ - do_step("step 8 (materialize strings to RAM)", [&]() { - strings_builder.load_into(d.m_spss.strings); - strings_builder.remove_file(); - }); - + void finalize_stats(dictionary& d) { if (build_config.verbose) { print_time(total_time_musec, "total time"); - d.print_space_breakdown(); + /* `print_space_breakdown` reads d.m_spss.strings; only safe in + the materialize-to-RAM flow. 
*/ + if (d.m_spss.strings.num_bits() > 0) d.print_space_breakdown(); } build_stats.add("total_build_time_in_microsec", total_time_musec); @@ -101,20 +145,6 @@ struct dictionary_builder // if (build_config.verbose) build_stats.print(); } - build_configuration build_config; - uint64_t num_kmers; - minimizers_tuples minimizers; - typename Offsets::builder strings_offsets_builder; - disk_backed_strings strings_builder; - weights::builder weights_builder; - - uint64_t strings_run_id; - - essentials::timer_type timer; - essentials::json_lines build_stats; - uint64_t total_time_musec; - -private: void print_time(double time_in_musec, std::string const& message) { std::cout << "=== " << message << ": " << time_in_musec / 1'000'000 << " [sec] (" << (time_in_musec * 1000) / num_kmers << " [ns/kmer])" << std::endl; diff --git a/include/builder/disk_backed_strings.hpp b/include/builder/disk_backed_strings.hpp index 7ec7e46..dabafe9 100644 --- a/include/builder/disk_backed_strings.hpp +++ b/include/builder/disk_backed_strings.hpp @@ -231,6 +231,52 @@ struct disk_backed_strings { return r; } + /* + Stream the strings to `os` in the same byte format that + `essentials::generic_saver::visit(bits::bit_vector const&)` would + produce — i.e., + uint64_t m_num_bits; + size_t n; // number of 64-bit words + uint64_t m_data[n]; + — without ever materializing the full bit-vector in RAM. The bytes + are read from the tmp file in fixed-size chunks. + + This relies on `bits::bit_vector::visit_impl` writing exactly two + fields (`m_num_bits` and the `m_data` owning_span) and on + `generic_saver::visit_seq` writing `size_t n` followed by the raw + `n * sizeof(uint64_t)` bytes. If `bits::bit_vector` ever changes its + on-disk representation, this method must be updated to match. 
+ */ + void save_to(std::ostream& os) const { + if (!m_frozen) { + throw std::runtime_error("disk_backed_strings: must freeze() before save_to()"); + } + const uint64_t num_bits = m_num_bits; + os.write(reinterpret_cast(&num_bits), sizeof(uint64_t)); + const uint64_t total_words = (num_bits + 63) / 64; + const std::size_t n = static_cast(total_words); + os.write(reinterpret_cast(&n), sizeof(std::size_t)); + if (total_words == 0) return; + std::ifstream in(m_filename, std::ifstream::binary); + if (!in.is_open()) { + throw std::runtime_error("cannot open strings tmp file '" + m_filename + "'"); + } + std::vector buffer(uint64_t(64) << 10); // 64 KiB + uint64_t bytes_remaining = total_words * sizeof(uint64_t); + while (bytes_remaining > 0) { + const std::streamsize chunk = static_cast( + std::min(buffer.size(), bytes_remaining)); + in.read(buffer.data(), chunk); + const std::streamsize got = in.gcount(); + if (got <= 0) { + throw std::runtime_error("unexpected EOF in strings tmp file '" + m_filename + "'"); + } + os.write(buffer.data(), got); + bytes_remaining -= static_cast(got); + } + in.close(); + } + /* Materialize the full bit-vector in RAM. This briefly peaks at the bit-vector size and is used immediately before `essentials::save`. diff --git a/include/builder/streaming_save.hpp b/include/builder/streaming_save.hpp new file mode 100644 index 0000000..73e4315 --- /dev/null +++ b/include/builder/streaming_save.hpp @@ -0,0 +1,101 @@ +#pragma once + +#include +#include +#include +#include +#include + +#include "essentials.hpp" +#include "external/pthash/external/bits/include/bit_vector.hpp" + +#include "include/builder/disk_backed_strings.hpp" + +namespace sshash { + +/* + A saver that mirrors `essentials::generic_saver`, except that any visit + to a specific `bits::bit_vector` instance (identified by address) is + redirected to `disk_backed_strings::save_to`, which streams the strings + bytes from the on-disk tmp file. 
All other visits go through the regular + `essentials` path. + + Using address-based identification means we don't need to add any + intermediate type or marker to `bits::bit_vector` itself. +*/ +struct streaming_strings_saver { + streaming_strings_saver(std::ostream& os, // + bits::bit_vector const* strings_addr, // + disk_backed_strings const* strings_storage) // + : m_os(os), m_strings_addr(strings_addr), m_strings_storage(strings_storage) { + if (m_strings_addr == nullptr || m_strings_storage == nullptr) { + throw std::runtime_error("streaming_strings_saver requires non-null arguments"); + } + } + + template + void visit(T const& val) { + if constexpr (std::is_same_v) { + if (&val == m_strings_addr) { + m_strings_storage->save_to(m_os); + return; + } + } + if constexpr (essentials::is_pod::value) { + essentials::save_pod(m_os, val); + } else { + val.visit(*this); + } + } + + template + void visit(std::vector const& vec) { + visit_seq(vec); + } + + template + void visit(essentials::owning_span const& vec) { + visit_seq(vec); + } + + std::size_t bytes() { return static_cast(m_os.tellp()); } + +private: + std::ostream& m_os; + bits::bit_vector const* m_strings_addr; + disk_backed_strings const* m_strings_storage; + + template + void visit_seq(Vec const& vec) { + using T = typename Vec::value_type; + const std::size_t n = vec.size(); + visit(n); + if constexpr (essentials::is_pod::value) { + m_os.write(reinterpret_cast(vec.data()), + static_cast(sizeof(T) * n)); + } else { + for (auto const& v : vec) visit(v); + } + } +}; + +/* + Save `t` to `filename`, streaming any embedded `bits::bit_vector` whose + address matches `strings_addr` from `strings_storage` instead of from + RAM. Other fields are saved using the standard `essentials` path. 
+*/ +template +void save_streaming(T const& t, char const* filename, // + bits::bit_vector const* strings_addr, // + disk_backed_strings const& strings_storage) // +{ + std::ofstream out(filename, std::ios::binary); + if (!out.good()) { + throw std::runtime_error(std::string("error opening file '") + filename + "' for writing"); + } + streaming_strings_saver saver(out, strings_addr, &strings_storage); + saver.visit(t); + out.close(); +} + +} // namespace sshash diff --git a/include/dictionary.hpp b/include/dictionary.hpp index a30b8c4..7790efb 100644 --- a/include/dictionary.hpp +++ b/include/dictionary.hpp @@ -25,9 +25,20 @@ struct dictionary // , m_m(0) , m_canonical(false) {} - /* Build from input file. */ + /* Build from input file. After this returns, `*this` is query-ready. */ void build(std::string const& input_filename, build_configuration const& build_config); + /* + Build from input file and stream-save the resulting dictionary to + `output_filename`. The strings bit-vector is never materialized in + RAM during construction, so peak RAM is bounded by the build phase + only. After this returns, `*this` is *not* query-ready + (`m_spss.strings` is empty); reload via `essentials::load` to query. 
+ */ + void build_streaming_save(std::string const& input_filename, + build_configuration const& build_config, + std::string const& output_filename); + essentials::version_number vnum() const { return m_vnum; } uint64_t num_kmers() const { return m_num_kmers; } uint64_t num_strings() const { return m_num_strings; } diff --git a/src/builder/build.cpp b/src/builder/build.cpp index e9eed1d..76d0b97 100644 --- a/src/builder/build.cpp +++ b/src/builder/build.cpp @@ -6,25 +6,42 @@ namespace sshash { +namespace { + +inline void validate_build_config_or_throw(build_configuration const& bc, uint64_t max_k, + uint64_t max_m) { + if (bc.k == 0) throw std::runtime_error("k must be > 0"); + if (bc.k > max_k) { + throw std::runtime_error("k must be less <= " + std::to_string(max_k) + + " but got k = " + std::to_string(bc.k)); + } + if (bc.m == 0) throw std::runtime_error("m must be > 0"); + if (bc.m > max_m) { + throw std::runtime_error("m must be less <= " + std::to_string(max_m) + + " but got m = " + std::to_string(bc.m)); + } + if (bc.m > bc.k) throw std::runtime_error("m must be <= k"); +} + +} // namespace + template void dictionary::build(std::string const& filename, build_configuration const& build_config) // { - /* Validate the build configuration. 
*/ - if (build_config.k == 0) throw std::runtime_error("k must be > 0"); - if (build_config.k > Kmer::max_k) { - throw std::runtime_error("k must be less <= " + std::to_string(Kmer::max_k) + - " but got k = " + std::to_string(build_config.k)); - } - if (build_config.m == 0) throw std::runtime_error("m must be > 0"); - if (build_config.m > Kmer::max_m) { - throw std::runtime_error("m must be less <= " + std::to_string(Kmer::max_m) + - " but got m = " + std::to_string(build_config.m)); - } - if (build_config.m > build_config.k) throw std::runtime_error("m must be <= k"); - + validate_build_config_or_throw(build_config, Kmer::max_k, Kmer::max_m); dictionary_builder builder(build_config); builder.build(*this, filename); } +template +void dictionary::build_streaming_save( + std::string const& input_filename, build_configuration const& build_config, + std::string const& output_filename) // +{ + validate_build_config_or_throw(build_config, Kmer::max_k, Kmer::max_m); + dictionary_builder builder(build_config); + builder.build_streaming_save(*this, input_filename, output_filename); +} + } // namespace sshash diff --git a/tools/build.cpp b/tools/build.cpp index 6630386..fbc7d1d 100644 --- a/tools/build.cpp +++ b/tools/build.cpp @@ -73,25 +73,38 @@ int build(int argc, char** argv) { // build_config.print(); - essentials::logger("building data structure..."); - dictionary_type dict; - dict.build(input_filename, build_config); - bool check = parser.get("check"); - if (check) { - check_correctness_lookup_access(dict, input_filename); - check_correctness_navigational_kmer_query(dict, input_filename); - check_correctness_navigational_string_query(dict); - if (build_config.weighted) check_correctness_weights(dict, input_filename); - check_correctness_kmer_iterator(dict); - check_correctness_string_iterator(dict); - } + bool has_output = parser.parsed("output_filename"); - if (parser.parsed("output_filename")) { + dictionary_type dict; + + if (has_output && !check) { + /* 
Streaming-save path: keeps peak RAM bounded by the build phase + (the strings bit-vector is never fully in RAM). After this returns + `dict` is not query-ready; reload from disk to query. */ auto output_filename = parser.get("output_filename"); - essentials::logger("saving data structure to disk..."); - essentials::save(dict, output_filename.c_str()); + essentials::logger("building data structure (streaming save)..."); + dict.build_streaming_save(input_filename, build_config, output_filename); essentials::logger("DONE"); + } else { + essentials::logger("building data structure..."); + dict.build(input_filename, build_config); + + if (check) { + check_correctness_lookup_access(dict, input_filename); + check_correctness_navigational_kmer_query(dict, input_filename); + check_correctness_navigational_string_query(dict); + if (build_config.weighted) check_correctness_weights(dict, input_filename); + check_correctness_kmer_iterator(dict); + check_correctness_string_iterator(dict); + } + + if (has_output) { + auto output_filename = parser.get("output_filename"); + essentials::logger("saving data structure to disk..."); + essentials::save(dict, output_filename.c_str()); + essentials::logger("DONE"); + } } return 0; From e5d26127427459a1f0cf167b62553668d3c9f43c Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 4 May 2026 21:20:08 +0000 Subject: [PATCH 06/32] step 7.1: drop redundant tuples copy; point bucket_type into mmap Previously step 7.1 copied every minimizer tuple of every non-singleton bucket into an in-RAM vector `tuples`, just to give bucket_type a contiguous backing store. The mmap'd `input` already provides exactly that, so the copy was pure overhead (~18 B per super-kmer in non-singleton buckets, scaling with the input). Now bucket_type stores raw minimizer_tuple pointers into input.data() directly, and `input` is kept open through step 7.2 phase (A) (which is the last consumer). After phase (A) both `buckets` and `input` are released. 
Verified byte-identical output vs the previous commit on salmonella_enterica m=7, plus full --check on regular, --canonical, multi-thread (-t 4), and --weighted builds. https://claude.ai/code/session_01BShS2GDASvEsCAbgJyQVBK --- src/builder/build_sparse_and_skew_index.cpp | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/src/builder/build_sparse_and_skew_index.cpp b/src/builder/build_sparse_and_skew_index.cpp index 9d8551b..891a61b 100644 --- a/src/builder/build_sparse_and_skew_index.cpp +++ b/src/builder/build_sparse_and_skew_index.cpp @@ -137,10 +137,9 @@ void dictionary_builder::build_sparse_and_skew_index( std::vector buckets; buckets.reserve(num_buckets_larger_than_1_not_in_skew_index + num_buckets_in_skew_index); - std::vector tuples; // backed memory - tuples.reserve(num_super_kmers_in_buckets_larger_than_1); - // Second pass: collect buckets > 1 for sorting AND handle size-1 buckets + /* Second pass: register buckets > 1 (pointing directly into the mmap'd + `input`, no copy) and handle size-1 buckets inline. */ for (minimizers_tuples_iterator it(input.data(), input.data() + input.size()); // it.has_next(); it.next()) // { @@ -163,18 +162,14 @@ void dictionary_builder::build_sparse_and_skew_index( } } } else { - // Collect buckets > 1 for later processing - minimizer_tuple const* begin = tuples.data() + tuples.size(); - std::copy(bucket.begin_ptr(), bucket.end_ptr(), std::back_inserter(tuples)); - minimizer_tuple const* end = tuples.data() + tuples.size(); - buckets.push_back(bucket_type(begin, end)); + /* Buckets > 1: store pointers directly into the mmap'd `input`. + `input` is kept open through step 7.2 phase (A). 
*/ + buckets.push_back(bucket_type(bucket.begin_ptr(), bucket.end_ptr())); } } assert(buckets.size() == num_buckets_larger_than_1_not_in_skew_index + num_buckets_in_skew_index); - input.close(); - std::sort(buckets.begin(), buckets.end(), [](bucket_type const& x, bucket_type const& y) { return x.size() < y.size(); }); @@ -408,6 +403,12 @@ void dictionary_builder::build_sparse_and_skew_index( assert(partition_id == num_partitions - 1); } + /* `buckets` and the mmap'd `input` are no longer needed: phase (B) walks + the sorted requests and per-partition tmp files, phase (C) walks the + per-partition tmp files. Free both now to bound RAM. */ + std::vector().swap(buckets); + input.close(); + if (build_config.verbose) { uint64_t total_kmers_in_skew = 0; for (uint64_t p = 0; p != num_partitions; ++p) { From 0301201892764e5501e847ba402675fe6197fb3e Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 4 May 2026 21:29:44 +0000 Subject: [PATCH 07/32] step 7.2 phase C: stream per-partition kmers from disk; external-memory MPHF Phase C used to materialize the full per-skew-partition working set in RAM: - kmers vector (~16 B/kmer, default uint64_t-backed kmer) - positions_in_bucket vector (~4 B/kmer) - cvb_positions builder (~num_bits_per_pos / 8 B/kmer, the actual stored output) For a partition with N kmers this peaked at ~21 N bytes (e.g. 20 GB for a 1 B-kmer partition). The kmers and positions_in_bucket vectors were redundant in-RAM copies of data already on disk in the per-partition tmp file written by phase (B). This commit replaces them with two streaming passes over the tmp file: (1) MPHF build via pthash's `build_in_external_memory` driven by a small forward iterator (`skew_partition_kmer_iterator`) that reads `(kmer.bits, pos_in_bucket)` records via a shared_ptr. pthash spills hashes to `tmp_dirname` under a `--ram-limit / 2` RAM budget rather than holding all keys + hashes simultaneously. 
(2) A second sequential pass over the same tmp file fills cvb_positions: for each `(kmer, pib)` record it sets cvb_positions[F(kmer)] = pib. Only cvb_positions itself stays resident through both passes, and it's the actual stored output (not a transient). The iterator must be copyable because pthash's `build_in_external_memory` takes the iterator by value; the shared_ptr means copies share the underlying stream state. After the build call returns the original at the call site is unused, so the shared advancement is harmless. Verified byte-identical output vs the previous commit on salmonella_enterica m=7, plus full --check on regular, --canonical, multi-thread (-t 4), and --weighted builds. https://claude.ai/code/session_01BShS2GDASvEsCAbgJyQVBK --- src/builder/build_sparse_and_skew_index.cpp | 135 ++++++++++++++------ 1 file changed, 99 insertions(+), 36 deletions(-) diff --git a/src/builder/build_sparse_and_skew_index.cpp b/src/builder/build_sparse_and_skew_index.cpp index 891a61b..0a4a3f8 100644 --- a/src/builder/build_sparse_and_skew_index.cpp +++ b/src/builder/build_sparse_and_skew_index.cpp @@ -36,6 +36,62 @@ struct kmer_extraction_request { }; #pragma pack(pop) +/* + Forward iterator over a per-skew-partition tmp file produced by step + 7.2 phase (B). Each record is `(kmer.bits, uint32_t pos_in_bucket)`. + This iterator yields successive Kmer values, exposing the minimal + interface (`*it`, `++it`) that pthash's external-memory partitioned PHF + builder consumes. + + pthash takes the iterator by value, so it must be copyable. The + underlying `ifstream` is held via `shared_ptr` and shared between + copies; pthash's copy advances the shared stream state, which is fine + because the original at the call site is no longer used after the + build call returns. 
+*/ +template +struct skew_partition_kmer_iterator { + using iterator_category = std::forward_iterator_tag; + using value_type = Kmer; + using difference_type = std::ptrdiff_t; + using reference = Kmer const&; + using pointer = Kmer const*; + + skew_partition_kmer_iterator() = default; + + void open(std::string const& filename) { + m_in = std::make_shared(filename, std::ifstream::binary); + if (!m_in->is_open()) { + throw std::runtime_error("cannot open skew-partition tmp file '" + filename + "'"); + } + advance(); + } + + void close() { + if (m_in && m_in->is_open()) m_in->close(); + m_in.reset(); + } + + Kmer const& operator*() const { return m_current; } + skew_partition_kmer_iterator& operator++() { + advance(); + return *this; + } + +private: + std::shared_ptr m_in; + Kmer m_current; + + void advance() { + decltype(Kmer{}.bits) bits; + m_in->read(reinterpret_cast(&bits), sizeof(bits)); + if (m_in->gcount() != static_cast(sizeof(bits))) return; + uint32_t pib; + m_in->read(reinterpret_cast(&pib), sizeof(pib)); // skip pos_in_bucket + m_current.bits = bits; + } +}; + template void dictionary_builder::build_sparse_and_skew_index( dictionary& d) // @@ -504,6 +560,11 @@ void dictionary_builder::build_sparse_and_skew_index( mphf_build_config.verbose = false; mphf_build_config.num_threads = build_config.num_threads; mphf_build_config.avg_partition_size = constants::avg_partition_size; + /* External-memory PHF: bound RAM by `--ram-limit` and spill hashes + to `tmp_dirname` rather than holding the partition's keys + (~16 B/kmer) and their hashes simultaneously in RAM. 
*/ + mphf_build_config.ram = (build_config.ram_limit_in_GiB * essentials::GiB) / 2; + mphf_build_config.tmp_dir = build_config.tmp_dirname; uint64_t lower = min_size; uint64_t upper = 2 * lower; @@ -525,59 +586,61 @@ void dictionary_builder::build_sparse_and_skew_index( if (n > 0) // { - std::vector kmers; - std::vector positions_in_bucket; - kmers.reserve(n); - positions_in_bucket.reserve(n); - - { - const std::string fn = skew_partition_filename(partition_id); - std::ifstream in(fn, std::ifstream::binary); - if (!in.is_open()) { - throw std::runtime_error("cannot open skew-partition tmp file"); - } - for (uint64_t i = 0; i != n; ++i) { - Kmer kmer; - in.read(reinterpret_cast(&kmer.bits), sizeof(kmer.bits)); - uint32_t pib; - in.read(reinterpret_cast(&pib), sizeof(pib)); - kmers.push_back(kmer); - positions_in_bucket.push_back(pib); - } - in.close(); - std::remove(fn.c_str()); - } - - bits::compact_vector::builder cvb_positions; - cvb_positions.resize(n, num_bits_per_pos); + const std::string fn = skew_partition_filename(partition_id); if (build_config.verbose) { const uint64_t avg_partition_size = - pthash::compute_avg_partition_size(kmers.size(), mphf_build_config); + pthash::compute_avg_partition_size(n, mphf_build_config); const uint64_t pthash_num_partitions = - pthash::compute_num_partitions(kmers.size(), avg_partition_size); + pthash::compute_num_partitions(n, avg_partition_size); assert(pthash_num_partitions > 0); - std::cout << " building MPHF with " << mphf_build_config.num_threads - << " threads and " << pthash_num_partitions + std::cout << " building MPHF (external memory) with " + << mphf_build_config.num_threads << " threads and " + << pthash_num_partitions << " partitions (avg. partition size = " << avg_partition_size << ")..." << std::endl; } + /* (1) Build the MPHF by streaming kmers from the partition + file. pthash's external-memory builder spills hashes + to tmp_dir under its own RAM budget; the iterator's + footprint is constant. 
*/ auto& F = mphfs[partition_id]; - F.build_in_internal_memory(kmers.begin(), kmers.size(), mphf_build_config); + { + skew_partition_kmer_iterator iter; + iter.open(fn); + F.build_in_external_memory(iter, n, mphf_build_config); + iter.close(); + } if (build_config.verbose) { - std::cout << " built mphs[" << partition_id << "] for " << kmers.size() + std::cout << " built mphs[" << partition_id << "] for " << F.num_keys() << " kmers; bits/key = " << static_cast(F.num_bits()) / F.num_keys() << std::endl; } - for (uint64_t i = 0; i != kmers.size(); ++i) { - Kmer kmer = kmers[i]; - uint64_t pos = F(kmer); - uint32_t pos_in_bucket = positions_in_bucket[i]; - cvb_positions.set(pos, pos_in_bucket); + /* (2) Re-stream the file to fill cvb_positions: for each + (kmer, pos_in_bucket), set cvb_positions[F(kmer)] = + pos_in_bucket. Only cvb_positions itself stays in RAM + (n * num_bits_per_pos bits, the actual stored output). */ + bits::compact_vector::builder cvb_positions; + cvb_positions.resize(n, num_bits_per_pos); + { + std::ifstream in(fn, std::ifstream::binary); + if (!in.is_open()) { + throw std::runtime_error("cannot open skew-partition tmp file"); + } + for (uint64_t i = 0; i != n; ++i) { + Kmer kmer; + in.read(reinterpret_cast(&kmer.bits), sizeof(kmer.bits)); + uint32_t pib; + in.read(reinterpret_cast(&pib), sizeof(pib)); + cvb_positions.set(F(kmer), pib); + } + in.close(); } + std::remove(fn.c_str()); + auto& P = positions[partition_id]; cvb_positions.build(P); From 5f9ec800b7db6d010a7f8a6bf0dec2ab2ed9d63d Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 5 May 2026 02:56:05 +0000 Subject: [PATCH 08/32] step 7.1 + 7.2 phase A: drop mmap; single ifstream pass MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Removes the last mmap from the build hot path. 
The merged minimizers file used to be opened via mm::file_source and
walked twice — once for stats and once to populate a size-sorted in-RAM
`buckets` vector — and then walked again in step 7.2 phase (A) via
pointers into the mmap. The recent "point bucket_type into the mmap"
change pushed even more work onto the page cache, which violates the
user's hard RAM budget (mmap'd pages can remain resident up to the full
file size when the machine has enough memory).

This commit replaces all of that with two sequential std::ifstream
passes (no mmap) over the merged minimizers file:

Pass 1 (stats): unchanged in spirit. Uses a new
`streaming_minimizer_bucket_reader` that buffers one bucket at a time
(peak ~ max_bucket_size * 18 B). Feeds `buckets_statistics` exactly as
before.

Pass 2 (combined sparse + heavy + emit kmer requests): folds the former
step 7.1 main pass and step 7.2 phase (A) into a single bucket-by-bucket
loop. The size-sorted iteration over `buckets` is gone; instead:

- `begin_buckets_of_size[s]` is precomputed from the bucket-size
  histogram (new accessor `buckets_statistics::num_buckets_of_size`),
- mid_load positions are written via per-size cursors using
  `compact_vector::builder::set` instead of `push_back`,
- heavy_load positions are appended in file order via a single monotone
  cursor,
- heavy buckets emit kmer-extraction requests in-line.

The `buckets` vector and the entire `tuples` array are gone. No memory
mapping anywhere in step 7. RAM footprint of step 7.1 is now bounded by:

- max_bucket_size * 18 B (one bucket at a time),
- the sparse-index builders being assembled (proportional to
  non-singleton positions, bits-packed),
- the kmer-extraction request buffer (~ ram_limit / 4).

Output bytes differ from the previous commit because the ordering of
positions inside mid_load_buckets and heavy_load_buckets is now file
order instead of size-sorted order; the codewords are updated to match.
The index is self-consistent: full --check passes on regular, --canonical, multi-thread (-t 4), and --weighted builds, and a streaming- save round-trip via `sshash check` and `sshash query` returns "EVERYTHING OK!" and 100% positive matches. https://claude.ai/code/session_01BShS2GDASvEsCAbgJyQVBK --- include/buckets_statistics.hpp | 6 + src/builder/build_sparse_and_skew_index.cpp | 510 ++++++++++---------- 2 files changed, 261 insertions(+), 255 deletions(-) diff --git a/include/buckets_statistics.hpp b/include/buckets_statistics.hpp index 6f582d7..13676f1 100644 --- a/include/buckets_statistics.hpp +++ b/include/buckets_statistics.hpp @@ -59,6 +59,12 @@ struct buckets_statistics { uint64_t max_bucket_size() const { return m_max_bucket_size; } uint64_t max_sparse_buckets_per_size() const { return m_max_sparse_buckets_per_size; } + /* Histogram bin: number of buckets whose size equals `s`. Bins beyond + MAX_BUCKET_SIZE are not tracked individually and return 0. */ + uint64_t num_buckets_of_size(uint64_t s) const { + return s < m_bucket_sizes.size() ? m_bucket_sizes[s] : uint64_t(0); + } + void print_full() const { std::cout << "=== bucket statistics (full) === \n"; for (uint64_t bucket_size = 1, prev_bucket_size = 0, prev_kmers_in_buckets = 0, diff --git a/src/builder/build_sparse_and_skew_index.cpp b/src/builder/build_sparse_and_skew_index.cpp index 0a4a3f8..a45df4e 100644 --- a/src/builder/build_sparse_and_skew_index.cpp +++ b/src/builder/build_sparse_and_skew_index.cpp @@ -36,6 +36,55 @@ struct kmer_extraction_request { }; #pragma pack(pop) +/* + Streaming reader over the merged minimizers file. Reads minimizer_tuple + records via std::ifstream (no mmap), and groups consecutive tuples by + minimizer into "buckets" — exactly as `minimizers_tuples_iterator` does + over an mmap'd buffer, but with bounded RAM (~ one bucket at a time). 
+ + The caller passes a vector to receive the bucket's tuples; for typical + inputs this peaks at max_bucket_size * sizeof(minimizer_tuple). +*/ +struct streaming_minimizer_bucket_reader { + void open(std::string const& filename) { + m_in.open(filename, std::ifstream::binary); + if (!m_in.is_open()) { + throw std::runtime_error("cannot open minimizers tmp file '" + filename + "'"); + } + // Read first record into the lookahead slot, if any. + m_in.read(reinterpret_cast(&m_lookahead), sizeof(minimizer_tuple)); + m_eof = (m_in.gcount() != static_cast(sizeof(minimizer_tuple))); + } + + void close() { + if (m_in.is_open()) m_in.close(); + } + + bool has_next_bucket() const { return !m_eof; } + + /* Read the next bucket into `bucket_out` (cleared first). All tuples in + a bucket share the same minimizer. Returns the bucket's minimizer. */ + uint64_t next_bucket(std::vector& bucket_out) { + bucket_out.clear(); + assert(!m_eof); + const uint64_t mm = m_lookahead.minimizer; + do { + bucket_out.push_back(m_lookahead); + m_in.read(reinterpret_cast(&m_lookahead), sizeof(minimizer_tuple)); + if (m_in.gcount() != static_cast(sizeof(minimizer_tuple))) { + m_eof = true; + break; + } + } while (m_lookahead.minimizer == mm); + return mm; + } + +private: + std::ifstream m_in; + minimizer_tuple m_lookahead; + bool m_eof = true; +}; + /* Forward iterator over a per-skew-partition tmp file produced by step 7.2 phase (B). Each record is `(kmer.bits, uint32_t pos_in_bucket)`. 
@@ -104,41 +153,51 @@ void dictionary_builder::build_sparse_and_skew_index( const uint64_t min_size = 1ULL << constants::min_l; const uint64_t num_bits_per_offset = strings_offsets_builder.num_bits_per_offset(); - mm::file_source input(minimizers.get_minimizers_filename(), - mm::advice::sequential); + const std::string minimizers_filename = minimizers.get_minimizers_filename(); buckets_statistics buckets_stats(num_minimizers, num_kmers, num_minimizer_positions); uint64_t num_buckets_larger_than_1_not_in_skew_index = 0; uint64_t num_buckets_in_skew_index = 0; - uint64_t num_super_kmers_in_buckets_larger_than_1 = 0; uint64_t num_minimizer_positions_of_buckets_larger_than_1 = 0; uint64_t num_minimizer_positions_of_buckets_in_skew_index = 0; /* - First pass: collect bucket statistics to compute tighter bound. + Pass 1: streaming statistics over the merged minimizers file. Buckets + are accumulated one at a time via std::ifstream-backed reads (no + mmap), so RAM usage is bounded by max_bucket_size * sizeof(tuple). 
*/ - for (minimizers_tuples_iterator it(input.data(), input.data() + input.size()); // - it.has_next(); it.next()) // { - auto bucket = it.bucket(); - const uint64_t bucket_size = bucket.size(); - buckets_stats.add_bucket_size(bucket_size); - - if (bucket_size > 1) { - if (bucket_size <= min_size) { - ++num_buckets_larger_than_1_not_in_skew_index; - num_minimizer_positions_of_buckets_larger_than_1 += bucket_size; - } else { - ++num_buckets_in_skew_index; - num_minimizer_positions_of_buckets_in_skew_index += bucket_size; + streaming_minimizer_bucket_reader reader; + reader.open(minimizers_filename); + std::vector bucket_buf; + while (reader.has_next_bucket()) { + reader.next_bucket(bucket_buf); + uint64_t bucket_size = 0; + { + uint64_t prev = constants::invalid_uint64; + for (auto const& mt : bucket_buf) { + if (mt.pos_in_seq != prev) { + ++bucket_size; + prev = mt.pos_in_seq; + } + } + } + buckets_stats.add_bucket_size(bucket_size); + if (bucket_size > 1) { + if (bucket_size <= min_size) { + ++num_buckets_larger_than_1_not_in_skew_index; + num_minimizer_positions_of_buckets_larger_than_1 += bucket_size; + } else { + ++num_buckets_in_skew_index; + num_minimizer_positions_of_buckets_in_skew_index += bucket_size; + } + } + for (auto const& mt : bucket_buf) { + buckets_stats.add_num_kmers_in_super_kmer(bucket_size, mt.num_kmers_in_super_kmer); } - num_super_kmers_in_buckets_larger_than_1 += bucket.num_super_kmers(); - } - - for (auto mt : bucket) { - buckets_stats.add_num_kmers_in_super_kmer(bucket_size, mt.num_kmers_in_super_kmer); } + reader.close(); } assert(buckets_stats.num_buckets() == num_minimizers); @@ -162,20 +221,17 @@ void dictionary_builder::build_sparse_and_skew_index( std::cout << "num_bits_for_control = " << num_bits_for_control << std::endl; } - bits::compact_vector::builder control_codewords_builder; - control_codewords_builder.resize(num_minimizers, num_bits_for_control); - - strings_offsets_builder.build(d.m_spss.strings_offsets); - /* 
`d.m_spss.strings` is materialized later, in step 8, from the on-disk - strings tmp file owned by `strings_builder`. Step 7.2 phase (B) reads - directly from the file via a `disk_backed_strings::reader` window. */ - - /* step 1. build sparse index */ - assert(buckets_stats.num_buckets() == num_minimizers); - const uint64_t max_bucket_size = buckets_stats.max_bucket_size(); const uint64_t log2_max_bucket_size = std::ceil(std::log2(max_bucket_size)); + uint64_t num_partitions = constants::max_l - constants::min_l + 1; + if (max_bucket_size < min_size) { + num_partitions = 0; + } else if (max_bucket_size < (1ULL << constants::max_l)) { + num_partitions = log2_max_bucket_size - constants::min_l; + } + assert(num_partitions <= 8); // so that we need 3 bits to encode a partition_id + if (build_config.verbose) { std::cout << "num_buckets_larger_than_1_not_in_skew_index " << num_buckets_larger_than_1_not_in_skew_index << "/" @@ -189,55 +245,6 @@ void dictionary_builder::build_sparse_and_skew_index( << std::endl; std::cout << "max_bucket_size " << max_bucket_size << std::endl; std::cout << "log2_max_bucket_size " << log2_max_bucket_size << std::endl; - } - - std::vector buckets; - buckets.reserve(num_buckets_larger_than_1_not_in_skew_index + num_buckets_in_skew_index); - - /* Second pass: register buckets > 1 (pointing directly into the mmap'd - `input`, no copy) and handle size-1 buckets inline. */ - for (minimizers_tuples_iterator it(input.data(), input.data() + input.size()); // - it.has_next(); it.next()) // - { - const uint64_t bucket_id = it.minimizer(); - auto bucket = it.bucket(); - const uint64_t bucket_size = bucket.size(); - if (bucket_size == 1) { - // Handle size-1 buckets: encode directly into control codewords - uint64_t prev_pos_in_seq = constants::invalid_uint64; - for (auto mt : bucket) { - if (mt.pos_in_seq != prev_pos_in_seq) { - /* - For minimizers occurring once, store a (log(N)+1)-bit - code, as follows: |offset|0|, i.e., the LSB is 0. 
- */ - uint64_t code = mt.pos_in_seq << 1; // first LS bit encodes status code: 0 - assert(code < (uint64_t(1) << num_bits_for_control)); - control_codewords_builder.set(bucket_id, code); - prev_pos_in_seq = mt.pos_in_seq; - } - } - } else { - /* Buckets > 1: store pointers directly into the mmap'd `input`. - `input` is kept open through step 7.2 phase (A). */ - buckets.push_back(bucket_type(bucket.begin_ptr(), bucket.end_ptr())); - } - } - assert(buckets.size() == - num_buckets_larger_than_1_not_in_skew_index + num_buckets_in_skew_index); - - std::sort(buckets.begin(), buckets.end(), - [](bucket_type const& x, bucket_type const& y) { return x.size() < y.size(); }); - - uint64_t num_partitions = constants::max_l - constants::min_l + 1; - if (max_bucket_size < min_size) { - num_partitions = 0; - } else if (max_bucket_size < (1ULL << constants::max_l)) { - num_partitions = log2_max_bucket_size - constants::min_l; - } - assert(num_partitions <= 8); // so that we need 3 bits to encode a partition_id - - if (build_config.verbose) { std::cout << "num_partitions in skew index " << num_partitions << std::endl; std::cout << "num_minimizer_positions_of_buckets_larger_than_1 " << num_minimizer_positions_of_buckets_larger_than_1 << "/" @@ -253,95 +260,193 @@ void dictionary_builder::build_sparse_and_skew_index( << "%)" << std::endl; } - { - bits::compact_vector::builder mid_load_buckets_builder; - bits::compact_vector::builder heavy_load_buckets_builder; - mid_load_buckets_builder.resize(num_minimizer_positions_of_buckets_larger_than_1, - num_bits_per_offset); - heavy_load_buckets_builder.resize(num_minimizer_positions_of_buckets_in_skew_index, - num_bits_per_offset); - - std::vector begin_buckets_of_size; - begin_buckets_of_size.resize(min_size + 1, 0); - - uint64_t curr_bucket_size = 2; - uint64_t list_id = 0; - uint64_t mid_load_buckets_size = 0; - uint64_t heavy_load_buckets_size = 0; - - uint64_t partition_id = 0; - uint64_t lower = min_size; - uint64_t upper = 2 * lower; 
+ /* Materialize strings_offsets now: needed below to decode pos_in_seq + into absolute offsets when emitting heavy-bucket kmer requests. + `d.m_spss.strings` is materialized later in step 8 (or stream-saved + directly to disk). */ + strings_offsets_builder.build(d.m_spss.strings_offsets); - for (auto bucket : buckets) { - const uint64_t bucket_size = bucket.size(); - assert(bucket_size >= 2); + /* Precompute the layout of mid_load_buckets from the bucket-size + histogram. begin_buckets_of_size[s] is the start offset (in + positions, not bits) of size-s bucket positions in mid_load_buckets; + it lets us write each bucket's positions in place during the + single-pass build, without needing to sort buckets by size. */ + std::vector begin_buckets_of_size(min_size + 1, 0); + for (uint64_t s = 3; s <= min_size; ++s) { + begin_buckets_of_size[s] = static_cast( // + begin_buckets_of_size[s - 1] + + buckets_stats.num_buckets_of_size(s - 1) * (s - 1)); + } - if (bucket_size > curr_bucket_size) { - while (bucket_size > curr_bucket_size) ++curr_bucket_size; - if (curr_bucket_size <= min_size) { - begin_buckets_of_size[curr_bucket_size] = mid_load_buckets_size; - } else { - while (curr_bucket_size > upper) { - lower = upper; - upper = 2 * lower; - partition_id += 1; - if (partition_id == num_partitions - 1) upper = max_bucket_size; + bits::compact_vector::builder control_codewords_builder; + bits::compact_vector::builder mid_load_buckets_builder; + bits::compact_vector::builder heavy_load_buckets_builder; + control_codewords_builder.resize(num_minimizers, num_bits_for_control); + mid_load_buckets_builder.resize(num_minimizer_positions_of_buckets_larger_than_1, + num_bits_per_offset); + heavy_load_buckets_builder.resize(num_minimizer_positions_of_buckets_in_skew_index, + num_bits_per_offset); + + /* Per-size cursor for mid_load (initialized to begin_buckets_of_size) + and per-size list_id counter; monotone cursor for heavy_load. 
*/ + std::vector mid_load_cursor(min_size + 1, 0); + for (uint64_t s = 2; s <= min_size; ++s) mid_load_cursor[s] = begin_buckets_of_size[s]; + std::vector list_id_per_size(min_size + 1, 0); + uint64_t heavy_load_cursor = 0; + + /* Per-partition kmer counts; filled during the heavy branch of the + combined pass below. */ + std::vector num_kmers_in_partition(num_partitions, 0); + + /* Skew-index tmp file naming. */ + const uint64_t skew_run_id = pthash::clock_type::now().time_since_epoch().count(); + auto request_run_filename = [&](uint64_t id) { + std::stringstream ss; + ss << build_config.tmp_dirname << "/sshash.tmp.run_" << skew_run_id + << ".kmer_requests." << id << ".bin"; + return ss.str(); + }; + auto skew_partition_filename = [&](uint64_t pid) { + std::stringstream ss; + ss << build_config.tmp_dirname << "/sshash.tmp.run_" << skew_run_id + << ".skew_kmers." << pid << ".bin"; + return ss.str(); + }; + + /* External-sort buffer for kmer-extraction requests (formerly step 7.2 + phase A; now folded into the combined pass). */ + std::atomic num_request_runs{0}; + const uint64_t request_buffer_capacity = std::max( + uint64_t(1) << 16, + (build_config.ram_limit_in_GiB * essentials::GiB) / + (4 * sizeof(kmer_extraction_request))); + std::vector request_buffer; + request_buffer.reserve(request_buffer_capacity); + auto flush_request_buffer = [&]() { + if (request_buffer.empty()) return; + parallel_sort(request_buffer, build_config.num_threads, + [](kmer_extraction_request const& a, kmer_extraction_request const& b) { + return a.starting_pos < b.starting_pos; + }); + const uint64_t id = num_request_runs.fetch_add(1); + const std::string fn = request_run_filename(id); + if (build_config.verbose) { + std::cout << "saving to file '" << fn << "'..." 
<< std::endl; + } + std::ofstream out(fn, std::ofstream::binary); + if (!out.is_open()) throw std::runtime_error("cannot open file"); + out.write(reinterpret_cast(request_buffer.data()), + request_buffer.size() * sizeof(kmer_extraction_request)); + out.close(); + request_buffer.clear(); + }; + + /* Map bucket size → partition_id for heavy buckets. num_partitions <= 8 + so this loop is constant time. */ + auto partition_for_size = [&](uint64_t bucket_size) -> uint64_t { + assert(bucket_size > min_size); + uint64_t pid = 0; + uint64_t upper = 2 * min_size; + while (bucket_size > upper && pid + 1 < num_partitions) { + upper *= 2; + ++pid; + } + return pid; + }; + + /* + Combined pass: stream the merged minimizers file once and, per + bucket, write the appropriate part of the sparse index. For heavy + buckets we also emit kmer-extraction requests in-line (what was + formerly step 7.2 phase A). No mmap; no in-RAM `buckets` array. + */ + { + streaming_minimizer_bucket_reader reader; + reader.open(minimizers_filename); + std::vector bucket_buf; + while (reader.has_next_bucket()) { + const uint64_t bucket_id = reader.next_bucket(bucket_buf); + uint64_t bucket_size = 0; + { + uint64_t prev = constants::invalid_uint64; + for (auto const& mt : bucket_buf) { + if (mt.pos_in_seq != prev) { + ++bucket_size; + prev = mt.pos_in_seq; } } - list_id = 0; } - if (curr_bucket_size <= min_size) { + if (bucket_size == 1) { + /* Singleton: code = |offset|0|, LSB = 0. */ + const uint64_t code = bucket_buf.front().pos_in_seq << 1; + assert(code < (uint64_t(1) << num_bits_for_control)); + control_codewords_builder.set(bucket_id, code); + } else if (bucket_size <= min_size) { + /* Mid-load: write positions at the per-size cursor and + assign the next list_id for this size. 
*/ + const uint64_t list_id = list_id_per_size[bucket_size]++; + const uint64_t code = + (((list_id << constants::min_l) | (bucket_size - 2)) << 2) | 1; + assert(code < (uint64_t(1) << num_bits_for_control)); + control_codewords_builder.set(bucket_id, code); + + uint64_t cursor = mid_load_cursor[bucket_size]; uint64_t prev_pos_in_seq = constants::invalid_uint64; - for (auto mt : bucket) { - if (prev_pos_in_seq == constants::invalid_uint64) { // only once - uint64_t p = (list_id << constants::min_l) | (curr_bucket_size - 2); - uint64_t code = (p << 2) | 1; // first two LS bits encode status code: 01 - assert(code < (uint64_t(1) << num_bits_for_control)); - control_codewords_builder.set(mt.minimizer, code); - } + for (auto const& mt : bucket_buf) { if (mt.pos_in_seq != prev_pos_in_seq) { - mid_load_buckets_builder.push_back(mt.pos_in_seq); + mid_load_buckets_builder.set(cursor++, mt.pos_in_seq); prev_pos_in_seq = mt.pos_in_seq; - mid_load_buckets_size += 1; } } - ++list_id; + mid_load_cursor[bucket_size] = cursor; } else { + /* Heavy: write positions at the monotone cursor, set the + codeword (encodes the start offset and partition id), + and emit kmer-extraction requests for each super-kmer + in the bucket. 
*/ + const uint64_t partition_id = partition_for_size(bucket_size); + assert(partition_id < num_partitions); + const uint64_t bucket_begin = heavy_load_cursor; + const uint64_t code = (((bucket_begin << 3) | partition_id) << 2) | 3; + assert(code < (uint64_t(1) << num_bits_for_control)); + control_codewords_builder.set(bucket_id, code); + + uint32_t pos_in_bucket = uint32_t(-1); uint64_t prev_pos_in_seq = constants::invalid_uint64; - for (auto mt : bucket) { - if (prev_pos_in_seq == constants::invalid_uint64) { // only once - assert(partition_id < 8); - uint64_t p = (heavy_load_buckets_size << 3) | partition_id; - uint64_t code = (p << 2) | 3; // first two LS bits encode status code: 11 - assert(code < (uint64_t(1) << num_bits_for_control)); - control_codewords_builder.set(mt.minimizer, code); - } + for (auto const& mt : bucket_buf) { + num_kmers_in_partition[partition_id] += mt.num_kmers_in_super_kmer; if (mt.pos_in_seq != prev_pos_in_seq) { - heavy_load_buckets_builder.push_back(mt.pos_in_seq); + heavy_load_buckets_builder.set(heavy_load_cursor++, mt.pos_in_seq); prev_pos_in_seq = mt.pos_in_seq; - heavy_load_buckets_size += 1; + ++pos_in_bucket; } + assert(mt.pos_in_seq >= mt.pos_in_kmer); + const uint64_t abs_offset = + d.m_spss.strings_offsets.decode(mt.pos_in_seq).absolute_offset; + const uint64_t starting_pos = abs_offset - mt.pos_in_kmer; + if (request_buffer.size() == request_buffer_capacity) flush_request_buffer(); + request_buffer.emplace_back(starting_pos, uint32_t(partition_id), + pos_in_bucket, + uint32_t(mt.num_kmers_in_super_kmer)); } } } - - d.m_ssi.begin_buckets_of_size = std::move(begin_buckets_of_size); - - control_codewords_builder.build(d.m_ssi.codewords.control_codewords); - mid_load_buckets_builder.build(d.m_ssi.mid_load_buckets); - heavy_load_buckets_builder.build(d.m_ssi.ski.heavy_load_buckets); + reader.close(); + flush_request_buffer(); } - timer.stop(); + /* Build sparse-index structures into the dictionary. 
*/ + d.m_ssi.begin_buckets_of_size = std::move(begin_buckets_of_size); + control_codewords_builder.build(d.m_ssi.codewords.control_codewords); + mid_load_buckets_builder.build(d.m_ssi.mid_load_buckets); + heavy_load_buckets_builder.build(d.m_ssi.ski.heavy_load_buckets); + timer.stop(); build_stats.add("step 7.1 (build sparse index)", uint64_t(timer.elapsed())); - if (build_config.verbose) { print_time(uint64_t(timer.elapsed()), "step 7.1 (build sparse index)"); } - timer.reset(); if (num_buckets_in_skew_index == 0) { @@ -352,119 +457,14 @@ void dictionary_builder::build_sparse_and_skew_index( /* step 2. build skew index - We do this in three sub-steps: - (A) walk the heavy-load buckets in size-sorted order, decode each - super-kmer's absolute starting position in `strings` and emit a - `kmer_extraction_request`. Requests are sort+flushed to disk in - chunks (external sort by `starting_pos`). - (B) merge the sorted runs and walk `strings` in a single forward - sequential pass, extracting the requested k-mers. For each k-mer - we append `(kmer.bits, pos_in_bucket)` to a per-partition tmp file. - (C) for each partition, read its tmp file, build the MPHF, then build - the positions compact vector. The skew index is assembled - partition by partition. - - Avoiding the random access pattern over `strings` in (B) is the - precondition for moving `strings` itself out of RAM in a later step. + Phases (B) and (C) below; phase (A) was folded into the combined + sparse pass above. Phase (B) extracts k-mers from `strings` in a + single forward sweep guided by the externally-sorted requests, and + phase (C) builds the per-partition MPHF + positions in external + memory from the per-partition kmer files. 
*/ timer.start(); - std::vector num_kmers_in_partition(num_partitions, 0); - - /* unique run identifier for the tmp files produced by this step */ - const uint64_t skew_run_id = pthash::clock_type::now().time_since_epoch().count(); - auto request_run_filename = [&](uint64_t id) { - std::stringstream ss; - ss << build_config.tmp_dirname << "/sshash.tmp.run_" << skew_run_id - << ".kmer_requests." << id << ".bin"; - return ss.str(); - }; - auto skew_partition_filename = [&](uint64_t pid) { - std::stringstream ss; - ss << build_config.tmp_dirname << "/sshash.tmp.run_" << skew_run_id - << ".skew_kmers." << pid << ".bin"; - return ss.str(); - }; - - /* (A) emit kmer-extraction requests, externally sorted by `starting_pos` */ - std::atomic num_request_runs{0}; - { - const uint64_t request_buffer_capacity = std::max( - uint64_t(1) << 16, - (build_config.ram_limit_in_GiB * essentials::GiB) / - (4 * sizeof(kmer_extraction_request))); - - std::vector request_buffer; - request_buffer.reserve(request_buffer_capacity); - - auto flush_request_buffer = [&]() { - if (request_buffer.empty()) return; - parallel_sort(request_buffer, build_config.num_threads, - [](kmer_extraction_request const& a, - kmer_extraction_request const& b) { - return a.starting_pos < b.starting_pos; - }); - const uint64_t id = num_request_runs.fetch_add(1); - const std::string fn = request_run_filename(id); - if (build_config.verbose) { - std::cout << "saving to file '" << fn << "'..." 
<< std::endl; - } - std::ofstream out(fn, std::ofstream::binary); - if (!out.is_open()) throw std::runtime_error("cannot open file"); - out.write(reinterpret_cast(request_buffer.data()), - request_buffer.size() * sizeof(kmer_extraction_request)); - out.close(); - request_buffer.clear(); - }; - - uint64_t partition_id = 0; - uint64_t lower = min_size; - uint64_t upper = 2 * lower; - - for (uint64_t i = buckets.size() - num_buckets_in_skew_index; i < buckets.size(); ++i) // - { - auto const& bucket = buckets[i]; - const uint64_t bucket_size = bucket.size(); - while (bucket_size > upper) // - { - lower = upper; - upper = 2 * lower; - partition_id += 1; - if (partition_id == num_partitions - 1) upper = max_bucket_size; - } - assert(bucket_size > lower and bucket_size <= upper); - assert(partition_id < num_partitions); - - uint32_t pos_in_bucket = uint32_t(-1); - uint64_t prev_pos_in_seq = constants::invalid_uint64; - for (auto mt : bucket) // - { - num_kmers_in_partition[partition_id] += mt.num_kmers_in_super_kmer; - if (mt.pos_in_seq != prev_pos_in_seq) { - prev_pos_in_seq = mt.pos_in_seq; - ++pos_in_bucket; - } - assert(mt.pos_in_seq >= mt.pos_in_kmer); - const uint64_t abs_offset = - d.m_spss.strings_offsets.decode(mt.pos_in_seq).absolute_offset; - const uint64_t starting_pos = abs_offset - mt.pos_in_kmer; - if (request_buffer.size() == request_buffer_capacity) flush_request_buffer(); - request_buffer.emplace_back(starting_pos, // - uint32_t(partition_id), // - pos_in_bucket, // - uint32_t(mt.num_kmers_in_super_kmer)); // - } - } - flush_request_buffer(); - assert(partition_id == num_partitions - 1); - } - - /* `buckets` and the mmap'd `input` are no longer needed: phase (B) walks - the sorted requests and per-partition tmp files, phase (C) walks the - per-partition tmp files. Free both now to bound RAM. 
*/ - std::vector().swap(buckets); - input.close(); - if (build_config.verbose) { uint64_t total_kmers_in_skew = 0; for (uint64_t p = 0; p != num_partitions; ++p) { From a6ac614fd5288ecf276aecc30a55e03a3119a550 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 5 May 2026 06:29:36 +0000 Subject: [PATCH 09/32] remove all remaining mmap from the SSHash build path This commit eliminates the last three mm::file_source usages in the build, so the only mmap left in step 7 is inside pthash's external-memory PHF builder (which manages its own RAM via config.ram). (1) file_merging_iterator: each input run is now read with a bounded buffered std::ifstream (default 4096 records per stream) instead of mm::file_source. The winner-tree merge logic is unchanged; comparisons just use the in-RAM buffer's current value rather than a pointer into mmap'd memory. RSS for the merge is now bounded by `num_runs * buffer_records * sizeof(T)` regardless of run sizes. (2) minimizers_tuples::merge: the post-rename single-file count path used to mmap the merged file via mm::file_source and walk it with minimizers_tuples_iterator. It now uses streaming_minimizer_bucket_reader (hoisted from build_sparse_and_skew_index.cpp into util.hpp) for a pure ifstream pass. (3) dictionary_builder::build_mphf: replaces mm::file_source + minimizers_tuples_iterator with a new streaming_minimizers_iterator that yields each distinct minimizer once via std::ifstream. The iterator is copyable (shared_ptr), as required by pthash's by-value `build_in_external_memory` signature. Verified byte-identical output vs the previous commit on salmonella_enterica m=7, plus full --check on regular, --canonical, multi-thread (-t 4), and --weighted builds, plus a streaming-save round-trip (sshash check + sshash query) returning "EVERYTHING OK!" on all five suites and 100% positive matches. 
https://claude.ai/code/session_01BShS2GDASvEsCAbgJyQVBK --- include/builder/dictionary_builder.hpp | 10 +- include/builder/file_merging_iterator.hpp | 183 +++++++++++++------- include/builder/util.hpp | 153 ++++++++++++++-- src/builder/build_sparse_and_skew_index.cpp | 49 ------ 4 files changed, 272 insertions(+), 123 deletions(-) diff --git a/include/builder/dictionary_builder.hpp b/include/builder/dictionary_builder.hpp index 3c4c5fe..29ad04e 100644 --- a/include/builder/dictionary_builder.hpp +++ b/include/builder/dictionary_builder.hpp @@ -169,11 +169,13 @@ struct dictionary_builder // void build_mphf(dictionary& d) { const uint64_t num_minimizers = minimizers.num_minimizers(); - mm::file_source input(minimizers.get_minimizers_filename(), - mm::advice::sequential); - minimizers_tuples_iterator iterator(input.data(), input.data() + input.size()); + /* Stream minimizers from disk via std::ifstream (no mmap); the + iterator yields each distinct minimizer once, matching what + `minimizers_tuples_iterator` did over the mmap'd file. */ + streaming_minimizers_iterator iterator; + iterator.open(minimizers.get_minimizers_filename()); d.m_ssi.codewords.build(iterator, num_minimizers, build_config); - input.close(); + iterator.close(); assert(d.m_ssi.codewords.size() == num_minimizers); } diff --git a/include/builder/file_merging_iterator.hpp b/include/builder/file_merging_iterator.hpp index 85b95ac..ff191ee 100644 --- a/include/builder/file_merging_iterator.hpp +++ b/include/builder/file_merging_iterator.hpp @@ -1,42 +1,59 @@ #pragma once -#include -#include -#include #include +#include +#include +#include +#include +#include #include "util.hpp" namespace sshash { /* - Winner-tree-based implementation. + Winner-tree-based external-merge iterator over N sorted runs on disk. + + Each run is read with a small buffered std::ifstream (no mmap) so that + process RSS stays bounded by `num_files_to_merge * buffer_records * + sizeof(T)` regardless of total run size. 
Values are surfaced as + `T const&` from each stream's in-RAM buffer; the merge logic compares + those values directly instead of pointers into mmap'd memory. + + Required of T: + - copy-constructible / move-constructible, + - `static T T::max()` returning a strict upper bound (used as the + sentinel for exhausted streams in the winner tree), + - `bool operator<(T, T)`. */ template struct file_merging_iterator // { + static constexpr uint64_t default_buffer_records = uint64_t(1) << 12; // 4096 records const uint64_t scan_threshold = 16; template - file_merging_iterator(FileNamesIterator file_names_iterator, uint64_t num_files_to_merge) - : m_mm_files(num_files_to_merge) // + file_merging_iterator(FileNamesIterator file_names_iterator, uint64_t num_files_to_merge, + uint64_t buffer_records = default_buffer_records) // { - if (num_files_to_merge == 0) return; + if (num_files_to_merge == 0) { + m_num_files_to_merge = 0; + return; + } - /* open files and create the input iterators */ - m_iterators.reserve(num_files_to_merge); + m_streams.reserve(num_files_to_merge); for (uint64_t i = 0; i != num_files_to_merge; ++i, ++file_names_iterator) { - m_mm_files[i].open(*file_names_iterator, mm::advice::sequential); - m_iterators.push_back( - {m_mm_files[i].data(), m_mm_files[i].data() + m_mm_files[i].size()}); + m_streams.emplace_back(); + m_streams.back().open(*file_names_iterator, buffer_records); } m_num_files_to_merge = num_files_to_merge; m_min_idx = 0; - if (m_iterators.size() <= scan_threshold) { + if (m_streams.size() <= scan_threshold) { compute_min(); } else { - /* build a winner tree */ + /* build a winner tree (same shape as before, but the leaves + index into m_streams instead of carrying raw pointers). 
*/ uint64_t n = num_files_to_merge; uint64_t m = 2 * n - 1; m_size = n; @@ -51,97 +68,143 @@ struct file_merging_iterator // bool has_next() { return m_num_files_to_merge != 0; } void next() { update(); } - T operator*() const { return *(m_iterators[m_min_idx].begin); } + T operator*() const { return m_streams[m_min_idx].current(); } void close() { - for (auto& mm_file : m_mm_files) mm_file.close(); - m_iterators.clear(); - m_mm_files.clear(); + for (auto& s : m_streams) s.close(); + m_streams.clear(); + m_streams.shrink_to_fit(); m_tree.clear(); + m_tree.shrink_to_fit(); } private: - struct pointer_pair { - T const* begin; - T const* end; + /* + A buffered, forward-only reader over a single run file. Reads in + chunks of `m_buf.size()` records via std::ifstream and presents a + T-by-reference current-value interface. + */ + struct buffered_stream { + buffered_stream() = default; + buffered_stream(buffered_stream const&) = delete; + buffered_stream& operator=(buffered_stream const&) = delete; + buffered_stream(buffered_stream&&) = default; + buffered_stream& operator=(buffered_stream&&) = default; + + void open(std::string const& filename, uint64_t buffer_records) { + m_buf.resize(std::max(1, buffer_records)); + m_in.open(filename, std::ifstream::binary); + if (!m_in.is_open()) { + throw std::runtime_error("cannot open run file '" + filename + "'"); + } + m_pos = 0; + m_size = 0; + m_eof = false; + refill(); + } + + void close() { + if (m_in.is_open()) m_in.close(); + m_buf.clear(); + m_buf.shrink_to_fit(); + m_pos = 0; + m_size = 0; + m_eof = true; + } + + bool empty() const { return m_pos >= m_size; } + + T const& current() const { + assert(!empty()); + return m_buf[m_pos]; + } + + void advance() { + assert(!empty()); + ++m_pos; + if (m_pos >= m_size && !m_eof) refill(); + } + + private: + std::ifstream m_in; + std::vector m_buf; + uint64_t m_pos = 0; + uint64_t m_size = 0; + bool m_eof = true; + + void refill() { + m_pos = 0; + 
m_in.read(reinterpret_cast(m_buf.data()), + static_cast(m_buf.size() * sizeof(T))); + const std::streamsize got = m_in.gcount(); + m_size = static_cast(got) / sizeof(T); + if (m_size == 0) m_eof = true; + } }; - std::vector m_iterators; - std::vector> m_mm_files; + + std::vector m_streams; std::vector m_tree; - uint64_t m_begin, m_size; - uint64_t m_min_idx, m_num_files_to_merge; + uint64_t m_begin = 0, m_size = 0; + uint64_t m_min_idx = 0, m_num_files_to_merge = 0; void update() { - if (m_iterators.size() <= scan_threshold) { // compute min with a linear scan - auto& it = m_iterators[m_min_idx]; - it.begin += 1; - if (it.begin == it.end) { - m_iterators.erase(m_iterators.begin() + m_min_idx); + if (m_streams.size() <= scan_threshold) { + auto& s = m_streams[m_min_idx]; + s.advance(); + if (s.empty()) { + m_streams.erase(m_streams.begin() + m_min_idx); m_min_idx = 0; --m_num_files_to_merge; if (m_num_files_to_merge == 0) return; } compute_min(); - } else { // update the winner tree + } else { // winner tree m_min_idx = m_tree[0]; - assert(m_min_idx < m_iterators.size()); - auto& it = m_iterators[m_min_idx]; - it.begin += 1; + assert(m_min_idx < m_streams.size()); + auto& s = m_streams[m_min_idx]; + s.advance(); uint64_t p = m_begin + m_min_idx; - p -= (p >= m_tree.size()) * m_size; // p is the index of the leaf - if (it.begin == it.end) { + p -= (p >= m_tree.size()) * m_size; // p is the leaf index + if (s.empty()) { m_tree[p] = uint32_t(-1); --m_num_files_to_merge; } const T inf = T::max(); while (p) { uint64_t is_r_child = (p & 1) == 0; - uint32_t i = 0; uint32_t l = m_tree[p - is_r_child]; uint32_t r = m_tree[p + 1 - is_r_child]; - - T const* ptr_l = (l == uint32_t(-1)) ? &inf : m_iterators[l].begin; - T const* ptr_r = (r == uint32_t(-1)) ? &inf : m_iterators[r].begin; - i = (*ptr_l < *ptr_r) ? 
l : r; - - /* same as this code but the one above uses CMOV */ - // if (l == uint32_t(-1)) { - // i = r; - // } else if (r == uint32_t(-1)) { - // i = l; - // } else { - // i = *(m_iterators[l].begin) < *(m_iterators[r].begin) ? l : r; - // } - + T const& vl = (l == uint32_t(-1)) ? inf : m_streams[l].current(); + T const& vr = (r == uint32_t(-1)) ? inf : m_streams[r].current(); + uint32_t i = (vl < vr) ? l : r; uint64_t parent = (p - 1) / 2; m_tree[parent] = i; p = parent; } m_min_idx = m_tree[0]; } - }; + } uint32_t build(uint32_t p) { if (p >= m_tree.size()) return uint32_t(-1); if (p >= m_size - 1) return m_tree[p]; // leaf uint32_t l = build(2 * p + 1); uint32_t r = build(2 * p + 2); - uint32_t i = 0; const T inf = T::max(); - T const* ptr_l = (l == uint32_t(-1)) ? &inf : m_iterators[l].begin; - T const* ptr_r = (r == uint32_t(-1)) ? &inf : m_iterators[r].begin; - i = (*ptr_l < *ptr_r) ? l : r; + T const& vl = (l == uint32_t(-1)) ? inf : m_streams[l].current(); + T const& vr = (r == uint32_t(-1)) ? inf : m_streams[r].current(); + uint32_t i = (vl < vr) ? 
l : r; m_tree[p] = i; return i; } void compute_min() { m_min_idx = 0; - auto min_val = *m_iterators.front().begin; - for (uint64_t i = 1; i != m_iterators.size(); ++i) { - assert(m_iterators[i].begin != m_iterators[i].end); - auto val = *m_iterators[i].begin; + T min_val = m_streams.front().current(); + for (uint64_t i = 1; i != m_streams.size(); ++i) { + assert(!m_streams[i].empty()); + T const& val = m_streams[i].current(); if (val < min_val) { min_val = val; m_min_idx = i; diff --git a/include/builder/util.hpp b/include/builder/util.hpp index 9b9b209..bd57038 100644 --- a/include/builder/util.hpp +++ b/include/builder/util.hpp @@ -1,7 +1,9 @@ #pragma once -#include #include +#include +#include +#include #include "file_merging_iterator.hpp" #include "parallel_sort.hpp" @@ -153,6 +155,126 @@ struct minimizers_tuples_iterator { } }; +/* + Streaming forward iterator over a sorted minimizers tmp file that + yields each distinct `minimizer` value exactly once (i.e., one value + per bucket). Equivalent to `minimizers_tuples_iterator` over an mmap'd + buffer, but reads from std::ifstream so RAM usage is constant. + + Copyable: pthash's `build_in_external_memory` takes the iterator by + value, so the underlying ifstream is held via shared_ptr. Copies share + the stream state; pthash's local copy advances the shared stream, and + the original at the call site is unused after the build returns. 
+*/ +struct streaming_minimizers_iterator { + using iterator_category = std::forward_iterator_tag; + using value_type = uint64_t; + using difference_type = std::ptrdiff_t; + using reference = uint64_t const&; + using pointer = uint64_t const*; + + streaming_minimizers_iterator() = default; + + void open(std::string const& filename) { + m_in = std::make_shared(filename, std::ifstream::binary); + if (!m_in->is_open()) { + throw std::runtime_error("cannot open minimizers tmp file '" + filename + "'"); + } + m_eof = false; + m_current = uint64_t(-1); + // Bootstrap: read the first tuple. + minimizer_tuple t; + m_in->read(reinterpret_cast(&t), sizeof(minimizer_tuple)); + if (m_in->gcount() != static_cast(sizeof(minimizer_tuple))) { + m_eof = true; + return; + } + m_current = t.minimizer; + } + + void close() { + if (m_in && m_in->is_open()) m_in->close(); + m_in.reset(); + } + + uint64_t operator*() const { return m_current; } + streaming_minimizers_iterator& operator++() { + advance_to_next_minimizer(); + return *this; + } + +private: + std::shared_ptr m_in; + uint64_t m_current = uint64_t(-1); + bool m_eof = true; + + void advance_to_next_minimizer() { + const uint64_t prev = m_current; + minimizer_tuple t; + while (true) { + m_in->read(reinterpret_cast(&t), sizeof(minimizer_tuple)); + if (m_in->gcount() != static_cast(sizeof(minimizer_tuple))) { + m_eof = true; + return; // m_current holds last value; pthash has consumed `num_minimizers` keys + } + if (t.minimizer != prev) { + m_current = t.minimizer; + return; + } + } + } +}; + +/* + Streaming reader over a minimizers tmp file. Reads minimizer_tuple + records via std::ifstream (no mmap), and groups consecutive tuples by + minimizer into "buckets" — exactly as `minimizers_tuples_iterator` does + over an mmap'd buffer, but with bounded RAM (~ one bucket at a time + plus one record of lookahead). 
+ + The caller passes a vector to receive the bucket's tuples; for typical + inputs this peaks at max_bucket_size * sizeof(minimizer_tuple). +*/ +struct streaming_minimizer_bucket_reader { + void open(std::string const& filename) { + m_in.open(filename, std::ifstream::binary); + if (!m_in.is_open()) { + throw std::runtime_error("cannot open minimizers tmp file '" + filename + "'"); + } + // Read first record into the lookahead slot, if any. + m_in.read(reinterpret_cast(&m_lookahead), sizeof(minimizer_tuple)); + m_eof = (m_in.gcount() != static_cast(sizeof(minimizer_tuple))); + } + + void close() { + if (m_in.is_open()) m_in.close(); + } + + bool has_next_bucket() const { return !m_eof; } + + /* Read the next bucket into `bucket_out` (cleared first). All tuples in + a bucket share the same minimizer. Returns the bucket's minimizer. */ + uint64_t next_bucket(std::vector& bucket_out) { + bucket_out.clear(); + assert(!m_eof); + const uint64_t mm = m_lookahead.minimizer; + do { + bucket_out.push_back(m_lookahead); + m_in.read(reinterpret_cast(&m_lookahead), sizeof(minimizer_tuple)); + if (m_in.gcount() != static_cast(sizeof(minimizer_tuple))) { + m_eof = true; + break; + } + } while (m_lookahead.minimizer == mm); + return mm; + } + +private: + std::ifstream m_in; + minimizer_tuple m_lookahead; + bool m_eof = true; +}; + struct minimizers_tuples { minimizers_tuples() {} minimizers_tuples(build_configuration const& build_config) @@ -217,17 +339,28 @@ struct minimizers_tuples { assert(m_num_minimizers == 0); assert(m_num_minimizer_positions == 0); assert(m_num_super_kmers == 0); - mm::file_source input(get_minimizers_filename(), - mm::advice::sequential); - for (minimizers_tuples_iterator it(input.data(), input.data() + input.size()); - it.has_next(); it.next()) // - { - auto bucket = it.bucket(); + + /* Single-pass count via streaming ifstream (no mmap). 
*/ + streaming_minimizer_bucket_reader reader; + reader.open(get_minimizers_filename()); + std::vector bucket_buf; + while (reader.has_next_bucket()) { + reader.next_bucket(bucket_buf); + uint64_t bucket_size = 0; + { + uint64_t prev = constants::invalid_uint64; + for (auto const& mt : bucket_buf) { + if (mt.pos_in_seq != prev) { + ++bucket_size; + prev = mt.pos_in_seq; + } + } + } m_num_minimizers += 1; - m_num_minimizer_positions += bucket.size(); - m_num_super_kmers += bucket.num_super_kmers(); + m_num_minimizer_positions += bucket_size; + m_num_super_kmers += bucket_buf.size(); } - input.close(); + reader.close(); return; } diff --git a/src/builder/build_sparse_and_skew_index.cpp b/src/builder/build_sparse_and_skew_index.cpp index a45df4e..de7c963 100644 --- a/src/builder/build_sparse_and_skew_index.cpp +++ b/src/builder/build_sparse_and_skew_index.cpp @@ -36,55 +36,6 @@ struct kmer_extraction_request { }; #pragma pack(pop) -/* - Streaming reader over the merged minimizers file. Reads minimizer_tuple - records via std::ifstream (no mmap), and groups consecutive tuples by - minimizer into "buckets" — exactly as `minimizers_tuples_iterator` does - over an mmap'd buffer, but with bounded RAM (~ one bucket at a time). - - The caller passes a vector to receive the bucket's tuples; for typical - inputs this peaks at max_bucket_size * sizeof(minimizer_tuple). -*/ -struct streaming_minimizer_bucket_reader { - void open(std::string const& filename) { - m_in.open(filename, std::ifstream::binary); - if (!m_in.is_open()) { - throw std::runtime_error("cannot open minimizers tmp file '" + filename + "'"); - } - // Read first record into the lookahead slot, if any. - m_in.read(reinterpret_cast(&m_lookahead), sizeof(minimizer_tuple)); - m_eof = (m_in.gcount() != static_cast(sizeof(minimizer_tuple))); - } - - void close() { - if (m_in.is_open()) m_in.close(); - } - - bool has_next_bucket() const { return !m_eof; } - - /* Read the next bucket into `bucket_out` (cleared first). 
All tuples in - a bucket share the same minimizer. Returns the bucket's minimizer. */ - uint64_t next_bucket(std::vector& bucket_out) { - bucket_out.clear(); - assert(!m_eof); - const uint64_t mm = m_lookahead.minimizer; - do { - bucket_out.push_back(m_lookahead); - m_in.read(reinterpret_cast(&m_lookahead), sizeof(minimizer_tuple)); - if (m_in.gcount() != static_cast(sizeof(minimizer_tuple))) { - m_eof = true; - break; - } - } while (m_lookahead.minimizer == mm); - return mm; - } - -private: - std::ifstream m_in; - minimizer_tuple m_lookahead; - bool m_eof = true; -}; - /* Forward iterator over a per-skew-partition tmp file produced by step 7.2 phase (B). Each record is `(kmer.bits, uint32_t pos_in_bucket)`. From fe44326848e66b9998721ca5518ba7c1fd69809d Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 5 May 2026 06:43:03 +0000 Subject: [PATCH 10/32] strings_offsets: stream to disk during build The strings_offsets_builder used to be an in-RAM std::vector of size num_strings + 1, finalized into d.m_spss.strings_offsets at step 7.1. For inputs with many strings this is non-trivial RSS (8 B per string), held through steps 1-7. This commit replaces it with a disk-backed disk_backed_offsets_builder: - push_back() during step 1 spills to a tmp file under a small in-RAM write buffer (~32 KiB), - compute_minimizer_tuples (step 2) opens one `reader` per thread, positioned at the thread's index_begin, and walks forward sequentially via a bounded read buffer (~32 KiB / thread). The per-thread `[i]`, `[i+1]` access pattern is replaced with a single rolling `prev_offset = next()` cursor. - encode(offset, begin, string_id) is now a pure const method on the disk-backed builder (depends only on m_nb and m_size), preserving multi-threaded safety. - build(target) at step 7.1 streams the file's contents via a copyable forward iterator into target.m_seq's encode/build, so neither side materializes the offsets in RAM. The on-disk file is removed after build. 
To keep `target.m_seq` accessible from the external builder without exposing it broadly, `offsets` befriends `disk_backed_offsets_builder<...>` via a templated friend declaration; the concrete decoded_offsets / encoded_offsets types inherit that friendship. Verified byte-identical output vs the previous commit on salmonella_enterica m=7, plus full --check on regular, --canonical, multi-thread (-t 4), and --weighted, and a streaming-save round-trip (sshash check + sshash query) with all five "EVERYTHING OK!" suites and 100% positive matches. https://claude.ai/code/session_01BShS2GDASvEsCAbgJyQVBK --- include/builder/dictionary_builder.hpp | 21 +- .../builder/disk_backed_offsets_builder.hpp | 327 ++++++++++++++++++ include/offsets.hpp | 7 + src/builder/compute_minimizer_tuples.cpp | 9 +- 4 files changed, 356 insertions(+), 8 deletions(-) create mode 100644 include/builder/disk_backed_offsets_builder.hpp diff --git a/include/builder/dictionary_builder.hpp b/include/builder/dictionary_builder.hpp index 29ad04e..6d30fb5 100644 --- a/include/builder/dictionary_builder.hpp +++ b/include/builder/dictionary_builder.hpp @@ -5,6 +5,7 @@ #include "include/offsets.hpp" #include "include/builder/util.hpp" #include "include/builder/disk_backed_strings.hpp" +#include "include/builder/disk_backed_offsets_builder.hpp" #include "include/builder/streaming_save.hpp" #include "include/buckets_statistics.hpp" @@ -20,7 +21,10 @@ struct dictionary_builder // , strings_run_id(pthash::clock_type::now().time_since_epoch().count()) , total_time_musec(0) {} - ~dictionary_builder() { strings_builder.remove_file(); } + ~dictionary_builder() { + strings_builder.remove_file(); + strings_offsets_builder.remove_file(); + } /* Build a query-ready dictionary in `d`. 
After this returns, @@ -59,7 +63,7 @@ struct dictionary_builder // build_configuration build_config; uint64_t num_kmers; minimizers_tuples minimizers; - typename Offsets::builder strings_offsets_builder; + disk_backed_offsets_builder strings_offsets_builder; disk_backed_strings strings_builder; weights::builder weights_builder; @@ -88,15 +92,20 @@ struct dictionary_builder // total_time_musec = 0; { - std::stringstream ss; - ss << build_config.tmp_dirname << "/sshash.tmp.run_" << strings_run_id - << ".strings.bin"; - strings_builder.open_for_writing(ss.str()); + std::stringstream ss_strings; + ss_strings << build_config.tmp_dirname << "/sshash.tmp.run_" << strings_run_id + << ".strings.bin"; + strings_builder.open_for_writing(ss_strings.str()); + std::stringstream ss_offsets; + ss_offsets << build_config.tmp_dirname << "/sshash.tmp.run_" << strings_run_id + << ".strings_offsets.bin"; + strings_offsets_builder.open_for_writing(ss_offsets.str()); } do_step("step 1 (encode strings)", [&]() { encode_strings(filename); strings_builder.freeze(); + strings_offsets_builder.freeze(); d.m_num_kmers = num_kmers; assert(strings_offsets_builder.size() >= 2); d.m_num_strings = strings_offsets_builder.size() - 1; diff --git a/include/builder/disk_backed_offsets_builder.hpp b/include/builder/disk_backed_offsets_builder.hpp new file mode 100644 index 0000000..c86d33f --- /dev/null +++ b/include/builder/disk_backed_offsets_builder.hpp @@ -0,0 +1,327 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "include/offsets.hpp" + +namespace sshash { + +/* + A disk-backed drop-in for `Offsets::builder` that spills offset values + to a tmp file as they're appended, keeping only a small in-RAM write + buffer. RAM usage of the builder is bounded by the buffer size, + independently of the number of strings. 
+ + Interface mirrors `decoded_offsets::builder` / `encoded_offsets::builder` + enough for the SSHash build path: + + reserve(n) no-op (kept for source compat) + push_back(val) append to file (buffered) + front() / back() / size() O(1), tracked separately + num_bytes() in-RAM footprint of the builder + set_num_bits(nb) stash bit-width metadata + num_bits_per_offset() dispatched per Offsets type + encode(offset, begin, sid) pure, dispatched per Offsets type + build(target) stream from disk into the final + compact / endpoints sequence + remove_file() cleanup + + Random-access `operator[]` is *not* supported. Callers that need to + walk a contiguous range of offsets must use a `reader`, which provides + forward-sequential reads via a small in-RAM buffer. +*/ +template +struct disk_backed_offsets_builder { + static_assert(std::is_same_v || + std::is_same_v, + "disk_backed_offsets_builder supports decoded_offsets and encoded_offsets"); + + static constexpr uint64_t default_writer_buffer_records = uint64_t(1) << 12; // 32 KiB + static constexpr uint64_t default_reader_buffer_records = uint64_t(1) << 12; // 32 KiB + + disk_backed_offsets_builder() = default; + disk_backed_offsets_builder(disk_backed_offsets_builder const&) = delete; + disk_backed_offsets_builder& operator=(disk_backed_offsets_builder const&) = delete; + + void open_for_writing(std::string const& filename, + uint64_t writer_buffer_records = default_writer_buffer_records) { + m_filename = filename; + m_writer_buffer_capacity = std::max(1, writer_buffer_records); + m_buf.clear(); + m_buf.reserve(m_writer_buffer_capacity); + m_size = 0; + m_front = 0; + m_back = 0; + m_have_front = false; + m_frozen = false; + m_writer.open(m_filename, std::ofstream::binary | std::ofstream::trunc); + if (!m_writer.is_open()) { + throw std::runtime_error("cannot open offsets tmp file '" + m_filename + "'"); + } + } + + /* No-op: kept for source-compatibility with the in-RAM builder. 
*/ + void reserve(uint64_t /*n*/) {} + + void push_back(uint64_t val) { + if (!m_have_front) { + m_front = val; + m_have_front = true; + } + m_back = val; + m_buf.push_back(val); + ++m_size; + if (m_buf.size() >= m_writer_buffer_capacity) flush_buffer(); + } + + /* Finish writing: flush the in-RAM buffer and close the writer. */ + void freeze() { + if (m_frozen) return; + flush_buffer(); + if (m_writer.is_open()) m_writer.close(); + m_frozen = true; + } + + uint64_t size() const { return m_size; } + uint64_t front() const { return m_front; } + uint64_t back() const { return m_back; } + std::string const& filename() const { return m_filename; } + + /* In-RAM footprint of the builder (excluding the on-disk file). */ + uint64_t num_bytes() const { + return sizeof(m_nb) + m_buf.capacity() * sizeof(uint64_t); + } + + void set_num_bits(num_bits nb) { m_nb = nb; } + + uint64_t num_bits_per_offset() const { + if constexpr (std::is_same_v) { + return m_nb.per_absolute_offset; + } else { + return m_nb.per_string_id + m_nb.per_relative_offset; + } + } + + /* Pure: matches `decoded_offsets::builder::encode` / + `encoded_offsets::builder::encode`. Safe to call concurrently from + multiple threads (depends only on m_nb and m_size, both of which + are stable while compute_minimizer_tuples runs). */ + uint64_t encode(uint64_t offset, uint64_t begin, uint64_t string_id) const { + if constexpr (std::is_same_v) { + (void)begin; + (void)string_id; + return offset; + } else { + assert(string_id < m_size); + assert(offset >= begin); + assert((offset - begin) < (uint64_t(1) << m_nb.per_relative_offset)); + uint64_t relative_offset = offset - begin; + return (string_id << m_nb.per_relative_offset) + relative_offset; + } + } + + /* + Forward-sequential reader over the offsets file. Each thread in + compute_minimizer_tuples should construct one for its assigned + index range; per-thread RAM footprint is the buffer size only. 
+ */ + struct reader { + reader() = default; + reader(reader const&) = delete; + reader& operator=(reader const&) = delete; + reader(reader&&) = default; + reader& operator=(reader&&) = default; + + /* Open the file and seek so that the next `next()` call returns + `*(values + start_index)`. */ + void open(std::string const& filename, uint64_t start_index, + uint64_t buffer_records = default_reader_buffer_records) { + m_buf.assign(std::max(1, buffer_records), 0); + m_pos = 0; + m_size = 0; + m_in.open(filename, std::ifstream::binary); + if (!m_in.is_open()) { + throw std::runtime_error("cannot open offsets tmp file '" + filename + "'"); + } + m_in.seekg(static_cast(start_index * sizeof(uint64_t)), + std::ios::beg); + refill(); + } + + void close() { + if (m_in.is_open()) m_in.close(); + m_buf.clear(); + m_buf.shrink_to_fit(); + m_pos = 0; + m_size = 0; + } + + /* Return the next offset and advance. Caller must ensure they + don't read past the end of the file. */ + uint64_t next() { + if (m_pos >= m_size) refill(); + assert(m_pos < m_size); + return m_buf[m_pos++]; + } + + private: + std::ifstream m_in; + std::vector m_buf; + uint64_t m_pos = 0; + uint64_t m_size = 0; + + void refill() { + m_pos = 0; + m_in.read(reinterpret_cast(m_buf.data()), + static_cast(m_buf.size() * sizeof(uint64_t))); + const std::streamsize got = m_in.gcount(); + m_size = static_cast(got) / sizeof(uint64_t); + if (m_size == 0) { + throw std::runtime_error("disk_backed_offsets_builder: read past end of file"); + } + } + }; + + /* Construct a reader positioned at `start_index`. Requires freeze(). 
*/ + reader make_reader(uint64_t start_index, + uint64_t buffer_records = default_reader_buffer_records) const { + if (!m_frozen) { + throw std::runtime_error( + "disk_backed_offsets_builder: must freeze() before make_reader()"); + } + reader r; + r.open(m_filename, start_index, buffer_records); + return r; + } + + /* + A copyable forward iterator over the entire offsets file, suitable + for the `Iterator`-template `encode` / `build` calls in + `bits::endpoints_sequence` and `bits::compact_vector`. Holds the + underlying ifstream via shared_ptr so the iterator can be copied + (those APIs may copy the iterator internally). + */ + struct full_iterator { + using iterator_category = std::forward_iterator_tag; + using value_type = uint64_t; + using difference_type = std::ptrdiff_t; + using reference = uint64_t const&; + using pointer = uint64_t const*; + + full_iterator() = default; + + void open(std::string const& filename, + uint64_t buffer_records = default_reader_buffer_records) { + m_state = std::make_shared(); + m_state->buf.assign(std::max(1, buffer_records), 0); + m_state->in.open(filename, std::ifstream::binary); + if (!m_state->in.is_open()) { + throw std::runtime_error("cannot open offsets tmp file '" + filename + "'"); + } + m_state->refill(); + } + + uint64_t operator*() const { + assert(m_state && m_state->pos < m_state->size); + return m_state->buf[m_state->pos]; + } + full_iterator& operator++() { + assert(m_state); + ++m_state->pos; + if (m_state->pos >= m_state->size && !m_state->eof) m_state->refill(); + return *this; + } + + private: + struct state { + std::ifstream in; + std::vector buf; + uint64_t pos = 0; + uint64_t size = 0; + bool eof = false; + void refill() { + pos = 0; + in.read(reinterpret_cast(buf.data()), + static_cast(buf.size() * sizeof(uint64_t))); + const std::streamsize got = in.gcount(); + size = static_cast(got) / sizeof(uint64_t); + if (size == 0) eof = true; + } + }; + std::shared_ptr m_state; + }; + + /* + Stream the offsets file 
into the target Offsets structure (mirrors + the in-RAM builder's `build`). After return, the file is removed + and `size()` resets to 0 to match the in-RAM builder, which clears + its m_v in `build`. + */ + void build(Offsets& target) { + if (!m_frozen) freeze(); + if (m_size == 0) { + remove_file(); + reset_state(); + return; + } + + if constexpr (std::is_same_v) { + full_iterator it; + it.open(m_filename); + target.m_seq.encode(it, m_size, m_back); + } else { + full_iterator it; + it.open(m_filename); + target.m_seq.build(it, m_size, m_nb.per_absolute_offset); + target.m_num_bits_per_relative_offset = m_nb.per_relative_offset; + } + + remove_file(); + reset_state(); + } + + /* Remove the on-disk tmp file (if any). */ + void remove_file() { + if (m_writer.is_open()) m_writer.close(); + if (!m_filename.empty()) std::remove(m_filename.c_str()); + } + +private: + std::string m_filename; + std::ofstream m_writer; + std::vector m_buf; + uint64_t m_writer_buffer_capacity = default_writer_buffer_records; + uint64_t m_size = 0; + uint64_t m_front = 0; + uint64_t m_back = 0; + bool m_have_front = false; + bool m_frozen = false; + num_bits m_nb; + + void flush_buffer() { + if (m_buf.empty()) return; + m_writer.write(reinterpret_cast(m_buf.data()), + static_cast(m_buf.size() * sizeof(uint64_t))); + m_buf.clear(); + } + + void reset_state() { + m_size = 0; + m_buf.clear(); + m_buf.shrink_to_fit(); + m_have_front = false; + m_front = 0; + m_back = 0; + m_frozen = false; + } +}; + +} // namespace sshash diff --git a/include/offsets.hpp b/include/offsets.hpp index e718ed3..b592e7b 100644 --- a/include/offsets.hpp +++ b/include/offsets.hpp @@ -5,6 +5,8 @@ namespace sshash { +template struct disk_backed_offsets_builder; + struct num_bits { num_bits() : per_absolute_offset(0), per_relative_offset(0), per_string_id(0) {} uint64_t per_absolute_offset; @@ -101,6 +103,11 @@ struct offsets // visit_impl(visitor, *this); } + /* Allow disk_backed_offsets_builder to populate m_seq 
directly via a + streaming forward iterator (mirroring what `Seq`'s nested builder + does, but with on-disk values). */ + template friend struct disk_backed_offsets_builder; + protected: Seq m_seq; uint64_t m_num_bits_per_relative_offset; diff --git a/src/builder/compute_minimizer_tuples.cpp b/src/builder/compute_minimizer_tuples.cpp index 8458857..a3e98ec 100644 --- a/src/builder/compute_minimizer_tuples.cpp +++ b/src/builder/compute_minimizer_tuples.cpp @@ -49,14 +49,19 @@ void dictionary_builder::compute_minimizer_tuples() // auto strings_reader = strings_builder.make_reader(); kmer_iterator kmer_it(strings_reader, k); + /* Per-thread forward reader over the offsets file, positioned + so the first `next()` returns offsets[index_begin]. */ + auto offsets_reader = strings_offsets_builder.make_reader(index_begin); + uint64_t prev_offset = offsets_reader.next(); // == offsets[index_begin] hasher_type hasher(build_config.seed); minimizer_iterator minimizer_it(k, m, hasher); minimizer_iterator_rc minimizer_it_rc(k, m, hasher); for (uint64_t i = index_begin; i < index_end; ++i) // { - const uint64_t begin = strings_offsets_builder[i]; - const uint64_t end = strings_offsets_builder[i + 1]; + const uint64_t begin = prev_offset; + const uint64_t end = offsets_reader.next(); // offsets[i + 1] + prev_offset = end; const uint64_t sequence_len = end - begin; assert(sequence_len >= k); From 27c71e8f1d8929a2c739492da63c06a913f2ce19 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 5 May 2026 16:28:01 +0000 Subject: [PATCH 11/32] spill the step-7 compact_vectors to disk; concatenate at save The four largest in-RAM "stored output" structures during the build are the sparse-index compact_vectors (control_codewords, mid_load_buckets, heavy_load_buckets) and the per-skew-partition cvb_positions. For huge inputs these can each be many GB. 
This commit makes the build never materialize them in RAM: - new streaming_compact_vector_writer: writes the same byte layout as bits::compact_vector::visit_impl (size, width, mask, owning_span) directly to a file via a 2-word rolling window, accepting set(index, value) in monotonic index order. Matches the +1 padding word that the in-RAM builder allocates so on-disk bytes are identical. - control_codewords: indices are mphf(minimizer), and the merged minimizers file is sorted by mphf hash, so writes during the combined pass are strictly monotonic. Streamed directly via the writer; no external sort. - heavy_load_buckets: single monotone cursor, also streamed directly via the writer. - mid_load_buckets: per-size cursors interleave across the pass, so each size class's positions are written to its own raw-uint64 tmp file (monotonic within size). After the combined pass, the per-size files are streamed in size order through a streaming_compact_vector_writer to assemble the final mid_load_buckets file. - cvb_positions per skew partition: writes are random by F(kmer). After the partition's MPHF is built, a second pass over the partition's kmer file emits (F(kmer), pos_in_bucket) tuples through the existing parallel_sort + flush + file_merging_iterator external-sort machinery; the sorted stream feeds a per-partition streaming_compact_vector_writer. The streaming saver is extended to take a substitution map keyed by `bits::compact_vector const*`. dictionary_builder populates the dictionary's compact_vector slots with empty placeholders, takes their addresses, and registers each spilled tmp file. The save pass copies bytes from each tmp file at the matching visit slot. For the materializing `build()` flow (used by --check), a new `materialize_spilled_into(d)` step re-loads each spilled tmp file back into the dictionary's in-RAM compact_vectors via essentials::loader, so queries work afterward. 
This brings the RAM peak back briefly at the very end (acceptable since --check inherently needs the full index in RAM). `build_streaming_save()` never materializes; the spilled tmp files are concatenated into the output by the saver and then removed. Verified byte-identical output vs the previous commit on salmonella_enterica m=7, plus full --check on regular, --canonical, multi-thread (-t 4), and --weighted, plus a streaming-save round-trip (sshash check + sshash query) with all five "EVERYTHING OK!" suites and 100% positive matches. No tmp file leaks. Remaining proportional-to-input items in RAM during build: the codewords MPHF (step 4) and the per-skew-partition MPHFs (step 7.2 phase C). pthash returns these as in-memory structs; spilling them would require pthash changes or an intermediate save/load step. Everything else is now bounded by the explicit --ram-limit. https://claude.ai/code/session_01BShS2GDASvEsCAbgJyQVBK --- include/builder/dictionary_builder.hpp | 116 +++++- .../streaming_compact_vector_writer.hpp | 144 ++++++++ include/builder/streaming_save.hpp | 68 +++- src/builder/build_sparse_and_skew_index.cpp | 341 +++++++++++++----- 4 files changed, 552 insertions(+), 117 deletions(-) create mode 100644 include/builder/streaming_compact_vector_writer.hpp diff --git a/include/builder/dictionary_builder.hpp b/include/builder/dictionary_builder.hpp index 6d30fb5..7e4af93 100644 --- a/include/builder/dictionary_builder.hpp +++ b/include/builder/dictionary_builder.hpp @@ -1,16 +1,52 @@ #pragma once +#include + #include "essentials.hpp" #include "include/dictionary.hpp" #include "include/offsets.hpp" #include "include/builder/util.hpp" #include "include/builder/disk_backed_strings.hpp" #include "include/builder/disk_backed_offsets_builder.hpp" +#include "include/builder/streaming_compact_vector_writer.hpp" #include "include/builder/streaming_save.hpp" #include "include/buckets_statistics.hpp" namespace sshash { +/* + Helper: load a serialized 
bits::compact_vector back from a tmp file + into the given in-RAM compact_vector. Used by the materializing build + flow (after step 7) so that --check / queries can run. +*/ +inline void materialize_compact_vector_from_file(bits::compact_vector& cv, + std::string const& filename) { + essentials::loader loader(filename.c_str()); + loader.visit(cv); +} + +/* + Tmp file paths for the compact_vectors that step 7 spills to disk. + Populated by build_sparse_and_skew_index; consumed by step 8 (either + materialized back into RAM for `build()`, or injected into the output + by `build_streaming_save()`). +*/ +struct spilled_components { + std::string control_codewords_path; + std::string mid_load_buckets_path; + std::string heavy_load_buckets_path; + std::vector skew_positions_paths; // one entry per skew partition + + void clear_files() { + if (!control_codewords_path.empty()) std::remove(control_codewords_path.c_str()); + if (!mid_load_buckets_path.empty()) std::remove(mid_load_buckets_path.c_str()); + if (!heavy_load_buckets_path.empty()) std::remove(heavy_load_buckets_path.c_str()); + for (auto const& p : skew_positions_paths) { + if (!p.empty()) std::remove(p.c_str()); + } + } +}; + template struct dictionary_builder // { @@ -24,29 +60,32 @@ struct dictionary_builder // ~dictionary_builder() { strings_builder.remove_file(); strings_offsets_builder.remove_file(); + spilled.clear_files(); } /* - Build a query-ready dictionary in `d`. After this returns, - `d.m_spss.strings` is materialized in RAM (peak briefly equals the - strings size). Use this when the caller needs to query `d` post-build - (e.g., `--check`). + Build a query-ready dictionary in `d`. After this returns, all + spilled components and `d.m_spss.strings` are materialized in RAM + (peak briefly equals the index size). Use this when the caller + needs to query `d` post-build (e.g., `--check`). 
*/ void build(dictionary& d, std::string const& filename) { run_steps_1_through_7(d, filename); - do_step("step 8 (materialize strings to RAM)", [&]() { + do_step("step 8 (materialize spilled components to RAM)", [&]() { + materialize_spilled_into(d); strings_builder.load_into(d.m_spss.strings); strings_builder.remove_file(); + spilled.clear_files(); }); finalize_stats(d); } /* Build the dictionary and stream-save it to `output_filename` without - ever materializing `strings` in RAM. After this returns, `d` is *not* - query-ready (`d.m_spss.strings` is empty). Use this when the caller - only needs the on-disk index file and wants to keep peak RAM bounded - by the build phase. + ever materializing the spilled components or `strings` in RAM. + After this returns, `d` is *not* query-ready. Use this when the + caller only needs the on-disk index file and wants to keep peak RAM + bounded by the build phase. */ void build_streaming_save(dictionary& d, // std::string const& filename, // @@ -54,8 +93,35 @@ struct dictionary_builder // { run_steps_1_through_7(d, filename); do_step("step 8 (stream-save dictionary to disk)", [&]() { - save_streaming(d, output_filename.c_str(), &d.m_spss.strings, strings_builder); + /* Populate placeholder compact_vectors at the visit slots whose + byte content the saver will substitute from disk tmp files. */ + std::unordered_map subs; + if (!spilled.control_codewords_path.empty()) { + subs[&d.m_ssi.codewords.control_codewords] = spilled.control_codewords_path; + } + if (!spilled.mid_load_buckets_path.empty()) { + subs[&d.m_ssi.mid_load_buckets] = spilled.mid_load_buckets_path; + } + if (!spilled.heavy_load_buckets_path.empty()) { + subs[&d.m_ssi.ski.heavy_load_buckets] = spilled.heavy_load_buckets_path; + } + /* skew positions: populate the owning_span with placeholders so + the visit walks the right number of entries and we can take + their addresses for substitution. 
*/ + const std::size_t num_part = spilled.skew_positions_paths.size(); + if (num_part > 0) { + std::vector placeholders(num_part); + d.m_ssi.ski.positions = std::move(placeholders); + for (std::size_t i = 0; i != num_part; ++i) { + if (!spilled.skew_positions_paths[i].empty()) { + subs[&d.m_ssi.ski.positions[i]] = spilled.skew_positions_paths[i]; + } + } + } + save_streaming(d, output_filename.c_str(), &d.m_spss.strings, strings_builder, + std::move(subs)); strings_builder.remove_file(); + spilled.clear_files(); }); finalize_stats(d); } @@ -66,6 +132,7 @@ struct dictionary_builder // disk_backed_offsets_builder strings_offsets_builder; disk_backed_strings strings_builder; weights::builder weights_builder; + spilled_components spilled; uint64_t strings_run_id; @@ -74,6 +141,35 @@ struct dictionary_builder // uint64_t total_time_musec; private: + /* Load each spilled compact_vector tmp file back into the corresponding + in-RAM compact_vector inside `d`. Used by the materializing build + flow so queries can run against `d` (e.g., during --check). 
*/ + void materialize_spilled_into(dictionary& d) { + if (!spilled.control_codewords_path.empty()) { + materialize_compact_vector_from_file(d.m_ssi.codewords.control_codewords, + spilled.control_codewords_path); + } + if (!spilled.mid_load_buckets_path.empty()) { + materialize_compact_vector_from_file(d.m_ssi.mid_load_buckets, + spilled.mid_load_buckets_path); + } + if (!spilled.heavy_load_buckets_path.empty()) { + materialize_compact_vector_from_file(d.m_ssi.ski.heavy_load_buckets, + spilled.heavy_load_buckets_path); + } + const std::size_t num_part = spilled.skew_positions_paths.size(); + if (num_part > 0) { + std::vector positions_vec(num_part); + for (std::size_t i = 0; i != num_part; ++i) { + if (!spilled.skew_positions_paths[i].empty()) { + materialize_compact_vector_from_file(positions_vec[i], + spilled.skew_positions_paths[i]); + } + } + d.m_ssi.ski.positions = std::move(positions_vec); + } + } + void run_steps_1_through_7(dictionary& d, std::string const& filename) { d.m_k = build_config.k; d.m_m = build_config.m; diff --git a/include/builder/streaming_compact_vector_writer.hpp b/include/builder/streaming_compact_vector_writer.hpp new file mode 100644 index 0000000..ea9dce1 --- /dev/null +++ b/include/builder/streaming_compact_vector_writer.hpp @@ -0,0 +1,144 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +namespace sshash { + +/* + Streams a `bits::compact_vector` to disk one entry at a time, accepting + `set(index, value)` calls in monotonically non-decreasing index order + (gaps are filled with zero, matching the default-zero semantics of an + in-RAM compact_vector::builder). + + The on-disk byte layout matches `bits::compact_vector::visit_impl`: + uint64_t m_size + uint64_t m_width + uint64_t m_mask + size_t n (= ceil(m_size * m_width / 64)) + uint64_t m_data[n] (little-endian bit-packed) + + Total RAM footprint: a 2-word rolling window plus the std::ofstream's + own buffer. 
Independently of the number of entries. +*/ +struct streaming_compact_vector_writer { + streaming_compact_vector_writer() = default; + streaming_compact_vector_writer(streaming_compact_vector_writer const&) = delete; + streaming_compact_vector_writer& operator=(streaming_compact_vector_writer const&) = delete; + + void open(std::string const& filename, uint64_t num_entries, uint64_t width) { + if (width == 0) throw std::runtime_error("streaming_compact_vector_writer: width must be > 0"); + if (width > 64) throw std::runtime_error("streaming_compact_vector_writer: width must be <= 64"); + m_filename = filename; + m_num_entries = num_entries; + m_width = width; + /* Match `bits::compact_vector::builder`'s data layout, which + allocates `words_for(size*width) + 1` words: the trailing + padding word allows in-RAM `set_bits` to write across word + boundaries without bounds checking and is part of the + serialized `m_data` owning_span. */ + const uint64_t packed_words = (num_entries == 0) ? 0 : (num_entries * width + 63) / 64; + m_total_words = packed_words + 1; + m_words_written = 0; + m_buf[0] = 0; + m_buf[1] = 0; + m_have_last_index = false; + + m_out.open(filename, std::ofstream::binary | std::ofstream::trunc); + if (!m_out.is_open()) { + throw std::runtime_error("cannot open compact_vector tmp file '" + filename + "'"); + } + + /* Header (matches bits::compact_vector::visit_impl). */ + write_pod(m_num_entries); + write_pod(m_width); + const uint64_t mask = (m_width == 64) ? uint64_t(-1) : ((uint64_t(1) << m_width) - 1); + write_pod(mask); + const std::size_t n = static_cast(m_total_words); + write_pod(n); + } + + /* Write a value at position `index`. Successive calls must satisfy + `index >= previous_index`; gaps are filled with zero. 
*/ + void set(uint64_t index, uint64_t value) { + if (m_have_last_index) { + assert(index >= m_last_index); + } + m_have_last_index = true; + m_last_index = index; + + const uint64_t bit_offset = index * m_width; + const uint64_t word_index = bit_offset / 64; + const uint64_t bit_in_word = bit_offset % 64; + + /* Slide the 2-word window forward to cover word_index. Words below + are now finalized; emit them. */ + while (m_words_written < word_index) { + write_word(m_buf[0]); + m_buf[0] = m_buf[1]; + m_buf[1] = 0; + ++m_words_written; + } + + /* OR `value` (m_width low bits of it) into the window starting at + bit_in_word of m_buf[0]; overflow goes into m_buf[1]. */ + const uint64_t fits_in_word_0 = 64 - bit_in_word; + if (m_width <= fits_in_word_0) { + if (m_width == 64) { + /* bit_in_word must be 0 here */ + m_buf[0] = value; + } else { + m_buf[0] |= value << bit_in_word; + } + } else { + m_buf[0] |= value << bit_in_word; + m_buf[1] |= value >> fits_in_word_0; + } + } + + /* Flush remaining buffered words and close the file. 
*/ + void finalize() { + while (m_words_written < m_total_words) { + write_word(m_buf[0]); + m_buf[0] = m_buf[1]; + m_buf[1] = 0; + ++m_words_written; + } + if (m_out.is_open()) m_out.close(); + } + + std::string const& filename() const { return m_filename; } + uint64_t num_entries() const { return m_num_entries; } + uint64_t width() const { return m_width; } + + void remove_file() { + if (m_out.is_open()) m_out.close(); + if (!m_filename.empty()) std::remove(m_filename.c_str()); + } + +private: + std::string m_filename; + std::ofstream m_out; + uint64_t m_num_entries = 0; + uint64_t m_width = 0; + uint64_t m_total_words = 0; + uint64_t m_words_written = 0; + uint64_t m_buf[2] = {0, 0}; + uint64_t m_last_index = 0; + bool m_have_last_index = false; + + template + void write_pod(T const& v) { + m_out.write(reinterpret_cast(&v), sizeof(T)); + } + void write_word(uint64_t w) { + m_out.write(reinterpret_cast(&w), sizeof(uint64_t)); + } +}; + +} // namespace sshash diff --git a/include/builder/streaming_save.hpp b/include/builder/streaming_save.hpp index 73e4315..9e9de1d 100644 --- a/include/builder/streaming_save.hpp +++ b/include/builder/streaming_save.hpp @@ -1,13 +1,16 @@ #pragma once +#include #include #include #include #include +#include #include #include "essentials.hpp" #include "external/pthash/external/bits/include/bit_vector.hpp" +#include "external/pthash/external/bits/include/compact_vector.hpp" #include "include/builder/disk_backed_strings.hpp" @@ -17,17 +20,25 @@ namespace sshash { A saver that mirrors `essentials::generic_saver`, except that any visit to a specific `bits::bit_vector` instance (identified by address) is redirected to `disk_backed_strings::save_to`, which streams the strings - bytes from the on-disk tmp file. All other visits go through the regular - `essentials` path. + bytes from the on-disk tmp file. 
Likewise, visits to `bits::compact_vector` + instances whose addresses appear in `compact_vector_subs` are replaced + with byte-for-byte streaming from the corresponding tmp file (which is + expected to be in `bits::compact_vector::visit_impl`'s on-disk format). - Using address-based identification means we don't need to add any - intermediate type or marker to `bits::bit_vector` itself. + Address-based identification means we don't need to add any intermediate + type or marker to bits::bit_vector / bits::compact_vector themselves. */ struct streaming_strings_saver { - streaming_strings_saver(std::ostream& os, // - bits::bit_vector const* strings_addr, // - disk_backed_strings const* strings_storage) // - : m_os(os), m_strings_addr(strings_addr), m_strings_storage(strings_storage) { + streaming_strings_saver( + std::ostream& os, // + bits::bit_vector const* strings_addr, // + disk_backed_strings const* strings_storage, // + std::unordered_map compact_vector_subs // + ) + : m_os(os) + , m_strings_addr(strings_addr) + , m_strings_storage(strings_storage) + , m_compact_vector_subs(std::move(compact_vector_subs)) { if (m_strings_addr == nullptr || m_strings_storage == nullptr) { throw std::runtime_error("streaming_strings_saver requires non-null arguments"); } @@ -41,6 +52,13 @@ struct streaming_strings_saver { return; } } + if constexpr (std::is_same_v) { + auto it = m_compact_vector_subs.find(&val); + if (it != m_compact_vector_subs.end()) { + stream_file_into_os(it->second); + return; + } + } if constexpr (essentials::is_pod::value) { essentials::save_pod(m_os, val); } else { @@ -64,6 +82,7 @@ struct streaming_strings_saver { std::ostream& m_os; bits::bit_vector const* m_strings_addr; disk_backed_strings const* m_strings_storage; + std::unordered_map m_compact_vector_subs; template void visit_seq(Vec const& vec) { @@ -77,23 +96,42 @@ struct streaming_strings_saver { for (auto const& v : vec) visit(v); } } + + void stream_file_into_os(std::string const& filename) { + 
std::ifstream in(filename, std::ifstream::binary); + if (!in.is_open()) { + throw std::runtime_error("cannot open spilled component file '" + filename + "'"); + } + char buf[64 * 1024]; + while (in.good()) { + in.read(buf, sizeof(buf)); + const std::streamsize got = in.gcount(); + if (got > 0) m_os.write(buf, got); + } + in.close(); + } }; /* - Save `t` to `filename`, streaming any embedded `bits::bit_vector` whose - address matches `strings_addr` from `strings_storage` instead of from - RAM. Other fields are saved using the standard `essentials` path. + Save `t` to `filename`. Any embedded bits::bit_vector matching + `strings_addr` is streamed from `strings_storage`; any embedded + bits::compact_vector whose address appears in `compact_vector_subs` + has its bytes copied from the corresponding tmp file. Other fields are + saved via the standard essentials path. */ template -void save_streaming(T const& t, char const* filename, // - bits::bit_vector const* strings_addr, // - disk_backed_strings const& strings_storage) // +void save_streaming(T const& t, char const* filename, // + bits::bit_vector const* strings_addr, // + disk_backed_strings const& strings_storage, // + std::unordered_map // + compact_vector_subs = {}) // { std::ofstream out(filename, std::ios::binary); if (!out.good()) { throw std::runtime_error(std::string("error opening file '") + filename + "' for writing"); } - streaming_strings_saver saver(out, strings_addr, &strings_storage); + streaming_strings_saver saver(out, strings_addr, &strings_storage, + std::move(compact_vector_subs)); saver.visit(t); out.close(); } diff --git a/src/builder/build_sparse_and_skew_index.cpp b/src/builder/build_sparse_and_skew_index.cpp index de7c963..3e9851d 100644 --- a/src/builder/build_sparse_and_skew_index.cpp +++ b/src/builder/build_sparse_and_skew_index.cpp @@ -1,4 +1,7 @@ +#include + #include "include/buckets_statistics.hpp" +#include "include/builder/streaming_compact_vector_writer.hpp" namespace sshash { @@ 
-36,6 +39,26 @@ struct kmer_extraction_request { }; #pragma pack(pop) +/* + A (mphf_pos, pos_in_bucket) record used to spill the per-skew-partition + `cvb_positions` to disk. We external-sort these by mphf_pos so the + streaming compact_vector writer can pack the final cvb_positions file in + a single forward pass. +*/ +#pragma pack(push, 4) +struct position_tuple { + position_tuple() {} + position_tuple(uint64_t mphf_pos, uint32_t pib) : mphf_pos(mphf_pos), pib(pib) {} + + bool operator<(position_tuple const& o) const { return mphf_pos < o.mphf_pos; } + bool operator>(position_tuple const& o) const { return mphf_pos > o.mphf_pos; } + static position_tuple max() { return {uint64_t(-1), uint32_t(-1)}; } + + uint64_t mphf_pos; + uint32_t pib; +}; +#pragma pack(pop) + /* Forward iterator over a per-skew-partition tmp file produced by step 7.2 phase (B). Each record is `(kmer.bits, uint32_t pos_in_bucket)`. @@ -157,8 +180,6 @@ void dictionary_builder::build_sparse_and_skew_index( Calculate bits needed for control codewords encoding. Encoding format: ((list_id << min_l) | (bucket_size - 2)) << 2 | status_code - We need: 2 bits (status) + min_l bits (bucket_size) + bits for list_id. - list_id is bounded by the maximum number of buckets sharing the same size. 
*/ const uint64_t bits_for_list_id = std::ceil(std::log2(buckets_stats.max_sparse_buckets_per_size() + 1)); @@ -181,7 +202,7 @@ void dictionary_builder::build_sparse_and_skew_index( } else if (max_bucket_size < (1ULL << constants::max_l)) { num_partitions = log2_max_bucket_size - constants::min_l; } - assert(num_partitions <= 8); // so that we need 3 bits to encode a partition_id + assert(num_partitions <= 8); if (build_config.verbose) { std::cout << "num_buckets_larger_than_1_not_in_skew_index " @@ -197,51 +218,72 @@ void dictionary_builder::build_sparse_and_skew_index( std::cout << "max_bucket_size " << max_bucket_size << std::endl; std::cout << "log2_max_bucket_size " << log2_max_bucket_size << std::endl; std::cout << "num_partitions in skew index " << num_partitions << std::endl; - std::cout << "num_minimizer_positions_of_buckets_larger_than_1 " - << num_minimizer_positions_of_buckets_larger_than_1 << "/" - << num_minimizer_positions << " (" - << (num_minimizer_positions_of_buckets_larger_than_1 * 100.0) / - num_minimizer_positions - << "%)" << std::endl; - std::cout << "num_minimizer_positions_of_buckets_in_skew_index " - << num_minimizer_positions_of_buckets_in_skew_index << "/" - << num_minimizer_positions << " (" - << (num_minimizer_positions_of_buckets_in_skew_index * 100.0) / - num_minimizer_positions - << "%)" << std::endl; } - /* Materialize strings_offsets now: needed below to decode pos_in_seq - into absolute offsets when emitting heavy-bucket kmer requests. - `d.m_spss.strings` is materialized later in step 8 (or stream-saved - directly to disk). */ + /* Materialize strings_offsets now (it's needed below to decode + pos_in_seq into absolute offsets when emitting heavy-bucket kmer + requests). `d.m_spss.strings` is materialized later (step 8) or + stream-saved directly. */ strings_offsets_builder.build(d.m_spss.strings_offsets); /* Precompute the layout of mid_load_buckets from the bucket-size histogram. 
begin_buckets_of_size[s] is the start offset (in - positions, not bits) of size-s bucket positions in mid_load_buckets; - it lets us write each bucket's positions in place during the - single-pass build, without needing to sort buckets by size. */ + positions, not bits) of size-s bucket positions in mid_load_buckets. */ std::vector begin_buckets_of_size(min_size + 1, 0); for (uint64_t s = 3; s <= min_size; ++s) { begin_buckets_of_size[s] = static_cast( // begin_buckets_of_size[s - 1] + buckets_stats.num_buckets_of_size(s - 1) * (s - 1)); } + d.m_ssi.begin_buckets_of_size = std::move(begin_buckets_of_size); + + /* All step-7.1 outputs are spilled to disk; the in-RAM dictionary + fields stay empty (they're populated later either from disk for + --check or substituted by the streaming saver). */ + const uint64_t step7_run_id = pthash::clock_type::now().time_since_epoch().count(); + auto step7_path = [&](std::string const& tag) { + std::stringstream ss; + ss << build_config.tmp_dirname << "/sshash.tmp.run_" << step7_run_id << "." << tag + << ".bin"; + return ss.str(); + }; + + spilled.control_codewords_path = step7_path("control_codewords"); + spilled.mid_load_buckets_path = step7_path("mid_load_buckets"); + spilled.heavy_load_buckets_path = step7_path("heavy_load_buckets"); + + /* Streaming writers for the two compact_vectors that get strictly + monotonic indices during the combined pass (control_codewords: + indexed by bucket_id == mphf hash, monotonic across buckets in + file order; heavy_load_buckets: indexed by a single monotone + cursor advanced inside the heavy branch). 
*/ + streaming_compact_vector_writer control_codewords_writer; + control_codewords_writer.open(spilled.control_codewords_path, num_minimizers, + num_bits_for_control); + streaming_compact_vector_writer heavy_load_writer; + heavy_load_writer.open(spilled.heavy_load_buckets_path, + num_minimizer_positions_of_buckets_in_skew_index, num_bits_per_offset); + + /* mid_load: per-size tmp files of raw uint64_t positions. Each file is + written monotonically within its size class. After the combined + pass we stream them in size order through a streaming + compact_vector writer to assemble the final mid_load_buckets file. */ + auto mid_load_per_size_path = [&](uint64_t s) { + std::stringstream ss; + ss << build_config.tmp_dirname << "/sshash.tmp.run_" << step7_run_id + << ".mid_load_size_" << s << ".bin"; + return ss.str(); + }; + std::vector mid_load_per_size(min_size + 1); + for (uint64_t s = 2; s <= min_size; ++s) { + if (buckets_stats.num_buckets_of_size(s) == 0) continue; + mid_load_per_size[s].open(mid_load_per_size_path(s), + std::ofstream::binary | std::ofstream::trunc); + if (!mid_load_per_size[s].is_open()) { + throw std::runtime_error("cannot open mid_load per-size tmp file"); + } + } - bits::compact_vector::builder control_codewords_builder; - bits::compact_vector::builder mid_load_buckets_builder; - bits::compact_vector::builder heavy_load_buckets_builder; - control_codewords_builder.resize(num_minimizers, num_bits_for_control); - mid_load_buckets_builder.resize(num_minimizer_positions_of_buckets_larger_than_1, - num_bits_per_offset); - heavy_load_buckets_builder.resize(num_minimizer_positions_of_buckets_in_skew_index, - num_bits_per_offset); - - /* Per-size cursor for mid_load (initialized to begin_buckets_of_size) - and per-size list_id counter; monotone cursor for heavy_load. 
*/ - std::vector mid_load_cursor(min_size + 1, 0); - for (uint64_t s = 2; s <= min_size; ++s) mid_load_cursor[s] = begin_buckets_of_size[s]; std::vector list_id_per_size(min_size + 1, 0); uint64_t heavy_load_cursor = 0; @@ -281,9 +323,6 @@ void dictionary_builder::build_sparse_and_skew_index( }); const uint64_t id = num_request_runs.fetch_add(1); const std::string fn = request_run_filename(id); - if (build_config.verbose) { - std::cout << "saving to file '" << fn << "'..." << std::endl; - } std::ofstream out(fn, std::ofstream::binary); if (!out.is_open()) throw std::runtime_error("cannot open file"); out.write(reinterpret_cast(request_buffer.data()), @@ -307,9 +346,15 @@ void dictionary_builder::build_sparse_and_skew_index( /* Combined pass: stream the merged minimizers file once and, per - bucket, write the appropriate part of the sparse index. For heavy - buckets we also emit kmer-extraction requests in-line (what was - formerly step 7.2 phase A). No mmap; no in-RAM `buckets` array. + bucket, write the appropriate part of the sparse index DIRECTLY TO + DISK via streaming compact_vector writers (control_codewords and + heavy_load_buckets) or per-size raw-value tmp files (mid_load). + For heavy buckets we also emit kmer-extraction requests in-line. + + Buckets are visited in mphf-hash (= bucket_id) order, so writes to + control_codewords are strictly monotonic. heavy_load_cursor is also + monotonic across the whole pass. mid_load per-size cursors are + each monotonic within their size class. */ { streaming_minimizer_bucket_reader reader; @@ -332,43 +377,43 @@ void dictionary_builder::build_sparse_and_skew_index( /* Singleton: code = |offset|0|, LSB = 0. 
*/ const uint64_t code = bucket_buf.front().pos_in_seq << 1; assert(code < (uint64_t(1) << num_bits_for_control)); - control_codewords_builder.set(bucket_id, code); + control_codewords_writer.set(bucket_id, code); } else if (bucket_size <= min_size) { - /* Mid-load: write positions at the per-size cursor and - assign the next list_id for this size. */ + /* Mid-load: write positions to per-size raw file at the + per-size cursor; assign the next list_id for this size. */ const uint64_t list_id = list_id_per_size[bucket_size]++; const uint64_t code = (((list_id << constants::min_l) | (bucket_size - 2)) << 2) | 1; assert(code < (uint64_t(1) << num_bits_for_control)); - control_codewords_builder.set(bucket_id, code); + control_codewords_writer.set(bucket_id, code); - uint64_t cursor = mid_load_cursor[bucket_size]; + auto& out = mid_load_per_size[bucket_size]; uint64_t prev_pos_in_seq = constants::invalid_uint64; for (auto const& mt : bucket_buf) { if (mt.pos_in_seq != prev_pos_in_seq) { - mid_load_buckets_builder.set(cursor++, mt.pos_in_seq); + const uint64_t v = mt.pos_in_seq; + out.write(reinterpret_cast(&v), sizeof(uint64_t)); prev_pos_in_seq = mt.pos_in_seq; } } - mid_load_cursor[bucket_size] = cursor; } else { - /* Heavy: write positions at the monotone cursor, set the - codeword (encodes the start offset and partition id), - and emit kmer-extraction requests for each super-kmer - in the bucket. */ + /* Heavy: write positions at the monotone heavy_load_cursor, + set the codeword (encodes the start offset and partition + id), and emit kmer-extraction requests for each + super-kmer in the bucket. 
*/ const uint64_t partition_id = partition_for_size(bucket_size); assert(partition_id < num_partitions); const uint64_t bucket_begin = heavy_load_cursor; const uint64_t code = (((bucket_begin << 3) | partition_id) << 2) | 3; assert(code < (uint64_t(1) << num_bits_for_control)); - control_codewords_builder.set(bucket_id, code); + control_codewords_writer.set(bucket_id, code); uint32_t pos_in_bucket = uint32_t(-1); uint64_t prev_pos_in_seq = constants::invalid_uint64; for (auto const& mt : bucket_buf) { num_kmers_in_partition[partition_id] += mt.num_kmers_in_super_kmer; if (mt.pos_in_seq != prev_pos_in_seq) { - heavy_load_buckets_builder.set(heavy_load_cursor++, mt.pos_in_seq); + heavy_load_writer.set(heavy_load_cursor++, mt.pos_in_seq); prev_pos_in_seq = mt.pos_in_seq; ++pos_in_bucket; } @@ -387,11 +432,47 @@ void dictionary_builder::build_sparse_and_skew_index( flush_request_buffer(); } - /* Build sparse-index structures into the dictionary. */ - d.m_ssi.begin_buckets_of_size = std::move(begin_buckets_of_size); - control_codewords_builder.build(d.m_ssi.codewords.control_codewords); - mid_load_buckets_builder.build(d.m_ssi.mid_load_buckets); - heavy_load_buckets_builder.build(d.m_ssi.ski.heavy_load_buckets); + /* Finalize the directly-streamed compact_vector files. */ + control_codewords_writer.finalize(); + heavy_load_writer.finalize(); + + /* Close per-size mid_load files. */ + for (uint64_t s = 2; s <= min_size; ++s) { + if (mid_load_per_size[s].is_open()) mid_load_per_size[s].close(); + } + + /* Concatenate per-size mid_load files in size order into the final + mid_load_buckets compact_vector file via the streaming writer. + Each per-size file holds raw uint64_t values written monotonically + within its size class; we just stream them through, packing into + num_bits_per_offset-bit fields at the precomputed begin offset for + each size. 
*/ + { + streaming_compact_vector_writer mid_load_writer; + mid_load_writer.open(spilled.mid_load_buckets_path, + num_minimizer_positions_of_buckets_larger_than_1, + num_bits_per_offset); + uint64_t global_index = 0; + for (uint64_t s = 2; s <= min_size; ++s) { + const uint64_t expected = buckets_stats.num_buckets_of_size(s) * s; + if (expected == 0) continue; + std::ifstream in(mid_load_per_size_path(s), std::ifstream::binary); + if (!in.is_open()) { + throw std::runtime_error("cannot reopen mid_load per-size tmp file"); + } + for (uint64_t i = 0; i != expected; ++i) { + uint64_t v; + in.read(reinterpret_cast(&v), sizeof(uint64_t)); + if (in.gcount() != static_cast(sizeof(uint64_t))) { + throw std::runtime_error("mid_load per-size tmp file truncated"); + } + mid_load_writer.set(global_index++, v); + } + in.close(); + std::remove(mid_load_per_size_path(s).c_str()); + } + mid_load_writer.finalize(); + } timer.stop(); build_stats.add("step 7.1 (build sparse index)", uint64_t(timer.elapsed())); @@ -411,8 +492,8 @@ void dictionary_builder::build_sparse_and_skew_index( Phases (B) and (C) below; phase (A) was folded into the combined sparse pass above. Phase (B) extracts k-mers from `strings` in a single forward sweep guided by the externally-sorted requests, and - phase (C) builds the per-partition MPHF + positions in external - memory from the per-partition kmer files. + phase (C) builds the per-partition MPHF + cvb_positions on disk + from the per-partition kmer files. 
*/ timer.start(); @@ -476,8 +557,6 @@ void dictionary_builder::build_sparse_and_skew_index( kmer = std::min(kmer, kmer_rc); } auto& w = partition_writers[req.partition_id]; - /* write only `kmer.bits` (avoids serializing the vptr that - `uint_kmer_t` carries due to its virtual destructor) */ w.write(reinterpret_cast(&kmer.bits), sizeof(kmer.bits)); w.write(reinterpret_cast(&req.pos_in_bucket), sizeof(req.pos_in_bucket)); @@ -496,27 +575,55 @@ void dictionary_builder::build_sparse_and_skew_index( } } - /* (C) per-partition MPHF + positions build */ + /* + (C) per-partition MPHF + cvb_positions build, both on disk. + + Per partition: + (1) Build MPHF in external memory by streaming the partition's + kmer file (pthash spills hashes to tmp_dirname under its own + ram budget). + (2) Stream-read the kmer file, compute F(kmer), emit + (F(kmer), pos_in_bucket) tuples to disk; external-sort by + F(kmer); stream sorted tuples through a + streaming_compact_vector_writer to produce the partition's + cvb_positions tmp file. + + Only the MPHF itself is held in RAM (pthash returns it as an + in-memory struct); cvb_positions is fully spilled. + */ { + spilled.skew_positions_paths.assign(num_partitions, std::string()); std::vector> mphfs; - std::vector positions; mphfs.resize(num_partitions); - positions.resize(num_partitions); pthash::build_configuration mphf_build_config; - mphf_build_config.lambda = - build_config.lambda + 2.0; /* Use higher lambda here since we have less keys. 
*/ + mphf_build_config.lambda = build_config.lambda + 2.0; mphf_build_config.alpha = 0.94; mphf_build_config.seed = util::get_seed_for_hash_function(build_config); mphf_build_config.verbose = false; mphf_build_config.num_threads = build_config.num_threads; mphf_build_config.avg_partition_size = constants::avg_partition_size; - /* External-memory PHF: bound RAM by `--ram-limit` and spill hashes - to `tmp_dirname` rather than holding the partition's keys - (~16 B/kmer) and their hashes simultaneously in RAM. */ mphf_build_config.ram = (build_config.ram_limit_in_GiB * essentials::GiB) / 2; mphf_build_config.tmp_dir = build_config.tmp_dirname; + const uint64_t pos_run_basename_id = pthash::clock_type::now().time_since_epoch().count(); + auto pos_run_filename = [&](uint64_t partition_id, uint64_t id) { + std::stringstream ss; + ss << build_config.tmp_dirname << "/sshash.tmp.run_" << pos_run_basename_id + << ".pos_runs.p" << partition_id << "." << id << ".bin"; + return ss.str(); + }; + auto skew_positions_filename = [&](uint64_t partition_id) { + std::stringstream ss; + ss << build_config.tmp_dirname << "/sshash.tmp.run_" << pos_run_basename_id + << ".skew_positions.p" << partition_id << ".bin"; + return ss.str(); + }; + + const uint64_t pos_buffer_capacity = std::max( + uint64_t(1) << 16, + (build_config.ram_limit_in_GiB * essentials::GiB) / (4 * sizeof(position_tuple))); + uint64_t lower = min_size; uint64_t upper = 2 * lower; uint64_t num_bits_per_pos = constants::min_l + 1; @@ -537,7 +644,7 @@ void dictionary_builder::build_sparse_and_skew_index( if (n > 0) // { - const std::string fn = skew_partition_filename(partition_id); + const std::string kmer_fn = skew_partition_filename(partition_id); if (build_config.verbose) { const uint64_t avg_partition_size = @@ -552,14 +659,11 @@ void dictionary_builder::build_sparse_and_skew_index( << ")..." << std::endl; } - /* (1) Build the MPHF by streaming kmers from the partition - file. 
pthash's external-memory builder spills hashes - to tmp_dir under its own RAM budget; the iterator's - footprint is constant. */ + /* (1) Build the MPHF by streaming kmers from the partition file. */ auto& F = mphfs[partition_id]; { skew_partition_kmer_iterator iter; - iter.open(fn); + iter.open(kmer_fn); F.build_in_external_memory(iter, n, mphf_build_config); iter.close(); } @@ -570,14 +674,31 @@ void dictionary_builder::build_sparse_and_skew_index( << static_cast(F.num_bits()) / F.num_keys() << std::endl; } - /* (2) Re-stream the file to fill cvb_positions: for each - (kmer, pos_in_bucket), set cvb_positions[F(kmer)] = - pos_in_bucket. Only cvb_positions itself stays in RAM - (n * num_bits_per_pos bits, the actual stored output). */ - bits::compact_vector::builder cvb_positions; - cvb_positions.resize(n, num_bits_per_pos); + /* (2a) Stream-read kmer file, compute F(kmer), externally + sort (F(kmer), pos_in_bucket) tuples by F(kmer). */ + std::atomic pos_num_runs{0}; { - std::ifstream in(fn, std::ifstream::binary); + std::vector pos_buffer; + pos_buffer.reserve(pos_buffer_capacity); + auto flush_pos_buffer = [&]() { + if (pos_buffer.empty()) return; + parallel_sort(pos_buffer, build_config.num_threads, + [](position_tuple const& a, position_tuple const& b) { + return a.mphf_pos < b.mphf_pos; + }); + const uint64_t id = pos_num_runs.fetch_add(1); + std::ofstream out(pos_run_filename(partition_id, id), + std::ofstream::binary); + if (!out.is_open()) { + throw std::runtime_error("cannot open positions tuple run file"); + } + out.write(reinterpret_cast(pos_buffer.data()), + pos_buffer.size() * sizeof(position_tuple)); + out.close(); + pos_buffer.clear(); + }; + + std::ifstream in(kmer_fn, std::ifstream::binary); if (!in.is_open()) { throw std::runtime_error("cannot open skew-partition tmp file"); } @@ -586,19 +707,54 @@ void dictionary_builder::build_sparse_and_skew_index( in.read(reinterpret_cast(&kmer.bits), sizeof(kmer.bits)); uint32_t pib; 
in.read(reinterpret_cast(&pib), sizeof(pib)); - cvb_positions.set(F(kmer), pib); + const uint64_t pos = F(kmer); + if (pos_buffer.size() == pos_buffer_capacity) flush_pos_buffer(); + pos_buffer.emplace_back(pos, pib); } in.close(); + std::remove(kmer_fn.c_str()); + flush_pos_buffer(); } - std::remove(fn.c_str()); - auto& P = positions[partition_id]; - cvb_positions.build(P); + /* (2b) Stream sorted tuples through the streaming + compact_vector writer to produce the partition's + cvb_positions tmp file. */ + { + spilled.skew_positions_paths[partition_id] = + skew_positions_filename(partition_id); + streaming_compact_vector_writer pos_writer; + pos_writer.open(spilled.skew_positions_paths[partition_id], n, + num_bits_per_pos); + + struct pos_run_names_iterator { + pos_run_names_iterator(uint64_t partition_id, + std::function fn) + : i(0), partition_id(partition_id), fn(std::move(fn)) {} + std::string operator*() { return fn(partition_id, i); } + void operator++() { ++i; } + uint64_t i; + uint64_t partition_id; + std::function fn; + }; + pos_run_names_iterator names_it(partition_id, pos_run_filename); + file_merging_iterator merger(names_it, pos_num_runs.load()); + while (merger.has_next()) { + position_tuple pt = *merger; + pos_writer.set(pt.mphf_pos, pt.pib); + merger.next(); + } + merger.close(); + pos_writer.finalize(); + } + + /* Cleanup the position-tuple run files. 
*/ + for (uint64_t i = 0; i != pos_num_runs.load(); ++i) { + std::remove(pos_run_filename(partition_id, i).c_str()); + } if (build_config.verbose) { - std::cout << " built positions[" << partition_id << "] for " << P.size() - << " kmers; bits/key = " << (P.num_bytes() * 8.0) / P.size() - << std::endl; + std::cout << " built positions[" << partition_id << "] for " << n + << " kmers; bits/key = " << num_bits_per_pos << std::endl; } } @@ -613,7 +769,8 @@ void dictionary_builder::build_sparse_and_skew_index( } d.m_ssi.ski.mphfs = std::move(mphfs); - d.m_ssi.ski.positions = std::move(positions); + /* d.m_ssi.ski.positions stays empty here; it will be populated + either by step 8 (materialize) or substituted at stream-save. */ } timer.stop(); From 2c73e09050bf48a7504de954d375a53a1d0db816 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 5 May 2026 17:27:56 +0000 Subject: [PATCH 12/32] spill the codewords + per-skew-partition MPHFs to disk The accumulating in-RAM MPHFs were the dominant resident memory at the end of phase C. For the HPRC k=63 m=31 canonical benchmark (5.9 B kmers, ~5.4 B in skew at ~3 bits/key) they sum to ~2 GB held simultaneously through save. This commit spills them to disk and concatenates at save: - Step 5 (hash_minimizers): the codewords MPHF is no longer needed after the minimizer values are remapped. Save it to a tmp file via essentials::save and default-assign the in-RAM struct to free it. - Step 7.2 phase C: after each partition's cvb_positions has been written to disk, save that partition's MPHF to a tmp file and default-assign it. Subsequent partitions never coexist with prior partitions' MPHFs. - The streaming saver gains an address+type-keyed substitution map (typed_address_sub). 
Type discrimination is necessary because in C++ a struct's address coincides with the address of its first member when the struct has standard layout, so address alone is ambiguous (visiting sparse_and_skew_index would otherwise match a substitution registered for its first member's first member, the codewords MPHF). - dictionary_builder uses a `register_sub` helper that captures the type via typeid for each registration. The saver only fires the substitution when both address and std::type_index(typeid(T)) match. - Materializing build() flow loads each spilled MPHF back via essentials::loader so queries work afterward. Verified byte-identical output vs the previous commit on salmonella_enterica m=7, plus full --check on regular, --canonical, multi-thread (-t 4), and --weighted, plus a streaming-save round-trip (sshash check + sshash query) with all five "EVERYTHING OK!" suites and 100% positive matches. No tmp file leaks. After this commit, the proportional-to-input items in RAM during build are bounded by --ram-limit: - pthash external-memory builder (capped at ram_limit/2), - one current MPHF being built (~bits/key * partition_size), - per-step external-sort buffers (capped), - the stored compact_vectors, MPHFs, etc. all spill. 
https://claude.ai/code/session_01BShS2GDASvEsCAbgJyQVBK --- include/builder/dictionary_builder.hpp | 86 ++++++++++++---- include/builder/streaming_save.hpp | 103 +++++++++++++------- src/builder/build_sparse_and_skew_index.cpp | 15 +++ 3 files changed, 153 insertions(+), 51 deletions(-) diff --git a/include/builder/dictionary_builder.hpp b/include/builder/dictionary_builder.hpp index 7e4af93..58a3ab1 100644 --- a/include/builder/dictionary_builder.hpp +++ b/include/builder/dictionary_builder.hpp @@ -36,14 +36,20 @@ struct spilled_components { std::string mid_load_buckets_path; std::string heavy_load_buckets_path; std::vector skew_positions_paths; // one entry per skew partition + std::string codewords_mphf_path; // step-4 minimizers MPHF + std::vector skew_mphfs_paths; // one entry per skew partition void clear_files() { if (!control_codewords_path.empty()) std::remove(control_codewords_path.c_str()); if (!mid_load_buckets_path.empty()) std::remove(mid_load_buckets_path.c_str()); if (!heavy_load_buckets_path.empty()) std::remove(heavy_load_buckets_path.c_str()); + if (!codewords_mphf_path.empty()) std::remove(codewords_mphf_path.c_str()); for (auto const& p : skew_positions_paths) { if (!p.empty()) std::remove(p.c_str()); } + for (auto const& p : skew_mphfs_paths) { + if (!p.empty()) std::remove(p.c_str()); + } } }; @@ -93,28 +99,46 @@ struct dictionary_builder // { run_steps_1_through_7(d, filename); do_step("step 8 (stream-save dictionary to disk)", [&]() { - /* Populate placeholder compact_vectors at the visit slots whose - byte content the saver will substitute from disk tmp files. */ - std::unordered_map subs; + /* Address+type-keyed substitution map. The saver replaces the + visit byte content of any registered (address, type) pair + with the bytes of the corresponding tmp file. Type matching + disambiguates the case where a struct's address coincides + with the address of its first member. 
*/ + std::unordered_map subs; if (!spilled.control_codewords_path.empty()) { - subs[&d.m_ssi.codewords.control_codewords] = spilled.control_codewords_path; + register_sub(subs, &d.m_ssi.codewords.control_codewords, + spilled.control_codewords_path); } if (!spilled.mid_load_buckets_path.empty()) { - subs[&d.m_ssi.mid_load_buckets] = spilled.mid_load_buckets_path; + register_sub(subs, &d.m_ssi.mid_load_buckets, spilled.mid_load_buckets_path); } if (!spilled.heavy_load_buckets_path.empty()) { - subs[&d.m_ssi.ski.heavy_load_buckets] = spilled.heavy_load_buckets_path; + register_sub(subs, &d.m_ssi.ski.heavy_load_buckets, + spilled.heavy_load_buckets_path); + } + if (!spilled.codewords_mphf_path.empty()) { + register_sub(subs, &d.m_ssi.codewords.mphf, spilled.codewords_mphf_path); } - /* skew positions: populate the owning_span with placeholders so - the visit walks the right number of entries and we can take - their addresses for substitution. */ - const std::size_t num_part = spilled.skew_positions_paths.size(); + /* Skew positions / mphfs: populate the owning_spans with + placeholders so the visit walks the right number of entries + and we can take their addresses for substitution. 
*/ + const std::size_t num_part = std::max(spilled.skew_positions_paths.size(), + spilled.skew_mphfs_paths.size()); if (num_part > 0) { - std::vector placeholders(num_part); - d.m_ssi.ski.positions = std::move(placeholders); - for (std::size_t i = 0; i != num_part; ++i) { + std::vector position_placeholders(num_part); + std::vector> mphf_placeholders(num_part); + d.m_ssi.ski.positions = std::move(position_placeholders); + d.m_ssi.ski.mphfs = std::move(mphf_placeholders); + for (std::size_t i = 0; i != spilled.skew_positions_paths.size(); ++i) { if (!spilled.skew_positions_paths[i].empty()) { - subs[&d.m_ssi.ski.positions[i]] = spilled.skew_positions_paths[i]; + register_sub(subs, &d.m_ssi.ski.positions[i], + spilled.skew_positions_paths[i]); + } + } + for (std::size_t i = 0; i != spilled.skew_mphfs_paths.size(); ++i) { + if (!spilled.skew_mphfs_paths[i].empty()) { + register_sub(subs, &d.m_ssi.ski.mphfs[i], + spilled.skew_mphfs_paths[i]); } } } @@ -157,16 +181,30 @@ struct dictionary_builder // materialize_compact_vector_from_file(d.m_ssi.ski.heavy_load_buckets, spilled.heavy_load_buckets_path); } - const std::size_t num_part = spilled.skew_positions_paths.size(); + /* Reload the spilled MPHFs back into RAM so queries work. 
*/ + if (!spilled.codewords_mphf_path.empty()) { + essentials::loader loader(spilled.codewords_mphf_path.c_str()); + loader.visit(d.m_ssi.codewords.mphf); + } + const std::size_t num_part = std::max(spilled.skew_positions_paths.size(), + spilled.skew_mphfs_paths.size()); if (num_part > 0) { std::vector positions_vec(num_part); - for (std::size_t i = 0; i != num_part; ++i) { + std::vector> mphfs_vec(num_part); + for (std::size_t i = 0; i != spilled.skew_positions_paths.size(); ++i) { if (!spilled.skew_positions_paths[i].empty()) { materialize_compact_vector_from_file(positions_vec[i], spilled.skew_positions_paths[i]); } } + for (std::size_t i = 0; i != spilled.skew_mphfs_paths.size(); ++i) { + if (!spilled.skew_mphfs_paths[i].empty()) { + essentials::loader loader(spilled.skew_mphfs_paths[i].c_str()); + loader.visit(mphfs_vec[i]); + } + } d.m_ssi.ski.positions = std::move(positions_vec); + d.m_ssi.ski.mphfs = std::move(mphfs_vec); } } @@ -288,7 +326,8 @@ struct dictionary_builder // std::string filename = minimizers.get_minimizers_filename(); std::ifstream input(filename, std::ifstream::binary); - auto const& f = d.m_ssi.codewords.mphf; + auto& f_mut = d.m_ssi.codewords.mphf; + auto const& f = f_mut; const uint64_t num_threads = build_config.num_threads; const uint64_t num_files_to_merge = minimizers.num_files_to_merge(); @@ -343,6 +382,19 @@ struct dictionary_builder // } input.close(); + + /* The codewords MPHF is no longer needed during build (step 6 onward + reads minimizer values that step 5 has already replaced with + mphf hashes; step 7 references mphf hashes only as bucket ids). + Spill it to disk and free its in-RAM footprint. 
*/ + { + std::stringstream ss; + ss << build_config.tmp_dirname << "/sshash.tmp.run_" << strings_run_id + << ".codewords_mphf.bin"; + spilled.codewords_mphf_path = ss.str(); + essentials::save(f_mut, spilled.codewords_mphf_path.c_str()); + f_mut = minimizers_pthash_type{}; + } } }; diff --git a/include/builder/streaming_save.hpp b/include/builder/streaming_save.hpp index 9e9de1d..7ccc87c 100644 --- a/include/builder/streaming_save.hpp +++ b/include/builder/streaming_save.hpp @@ -5,40 +5,63 @@ #include #include #include +#include +#include #include +#include #include #include "essentials.hpp" #include "external/pthash/external/bits/include/bit_vector.hpp" -#include "external/pthash/external/bits/include/compact_vector.hpp" #include "include/builder/disk_backed_strings.hpp" namespace sshash { /* - A saver that mirrors `essentials::generic_saver`, except that any visit - to a specific `bits::bit_vector` instance (identified by address) is - redirected to `disk_backed_strings::save_to`, which streams the strings - bytes from the on-disk tmp file. Likewise, visits to `bits::compact_vector` - instances whose addresses appear in `compact_vector_subs` are replaced - with byte-for-byte streaming from the corresponding tmp file (which is - expected to be in `bits::compact_vector::visit_impl`'s on-disk format). - - Address-based identification means we don't need to add any intermediate - type or marker to bits::bit_vector / bits::compact_vector themselves. + A typed substitution: the saver replaces the visit byte content of an + object at `address` with the bytes of `filename` only if the visited + type T satisfies `std::type_index(typeid(T)) == type`. + + Type discrimination is necessary because in C++ a struct's address + coincides with the address of its first member when the struct has + standard layout. Without the type check, a substitution registered + for an inner field would also fire (incorrectly) on every enclosing + parent that shares its address. 
+*/ +struct typed_address_sub { + std::string filename; + std::type_index type; +}; + +/* + A saver that mirrors `essentials::generic_saver`, except for two + interception mechanisms used during streaming save: + + 1. The `bits::bit_vector` instance whose address matches `strings_addr` + has its bytes streamed from `strings_storage` (which writes the + same on-disk format `bits::bit_vector::visit_impl` produces). + + 2. Any object whose address appears in `address_subs` has its visit + byte content replaced by a copy of the corresponding tmp file. + This is type-agnostic — it works for `bits::compact_vector`, for + pthash MPHFs, or anything else whose serialized form has been + saved to a file via `essentials::save`. + + The substitution check is performed at the start of every visit + call (whatever T is); if no match, the call falls through to the + regular `essentials::generic_saver` logic (POD via save_pod, or + recursion via val.visit(*this)). */ struct streaming_strings_saver { - streaming_strings_saver( - std::ostream& os, // - bits::bit_vector const* strings_addr, // - disk_backed_strings const* strings_storage, // - std::unordered_map compact_vector_subs // - ) + streaming_strings_saver(std::ostream& os, // + bits::bit_vector const* strings_addr, // + disk_backed_strings const* strings_storage, // + std::unordered_map address_subs) : m_os(os) , m_strings_addr(strings_addr) , m_strings_storage(strings_storage) - , m_compact_vector_subs(std::move(compact_vector_subs)) { + , m_address_subs(std::move(address_subs)) { if (m_strings_addr == nullptr || m_strings_storage == nullptr) { throw std::runtime_error("streaming_strings_saver requires non-null arguments"); } @@ -46,19 +69,25 @@ struct streaming_strings_saver { template void visit(T const& val) { + /* Type+address substitution (compact_vectors, MPHFs, etc.). + Both must match: address alone is ambiguous when a struct + shares its address with its first member. 
*/ + void const* addr = static_cast(&val); + auto it = m_address_subs.find(addr); + if (it != m_address_subs.end() && it->second.type == std::type_index(typeid(T))) { + stream_file_into_os(it->second.filename); + return; + } + /* Strings: dedicated callback because the on-disk strings file + holds raw words (not the bits::bit_vector serialized form); + `disk_backed_strings::save_to(os)` writes the visit_impl format + on the fly. */ if constexpr (std::is_same_v) { if (&val == m_strings_addr) { m_strings_storage->save_to(m_os); return; } } - if constexpr (std::is_same_v) { - auto it = m_compact_vector_subs.find(&val); - if (it != m_compact_vector_subs.end()) { - stream_file_into_os(it->second); - return; - } - } if constexpr (essentials::is_pod::value) { essentials::save_pod(m_os, val); } else { @@ -82,7 +111,7 @@ struct streaming_strings_saver { std::ostream& m_os; bits::bit_vector const* m_strings_addr; disk_backed_strings const* m_strings_storage; - std::unordered_map m_compact_vector_subs; + std::unordered_map m_address_subs; template void visit_seq(Vec const& vec) { @@ -115,25 +144,31 @@ struct streaming_strings_saver { /* Save `t` to `filename`. Any embedded bits::bit_vector matching `strings_addr` is streamed from `strings_storage`; any embedded - bits::compact_vector whose address appears in `compact_vector_subs` - has its bytes copied from the corresponding tmp file. Other fields are - saved via the standard essentials path. + object whose address appears in `address_subs` has its bytes copied + from the corresponding tmp file. Other fields are saved via the + standard essentials path. 
*/ template void save_streaming(T const& t, char const* filename, // bits::bit_vector const* strings_addr, // disk_backed_strings const& strings_storage, // - std::unordered_map // - compact_vector_subs = {}) // -{ + std::unordered_map address_subs = {}) { std::ofstream out(filename, std::ios::binary); if (!out.good()) { throw std::runtime_error(std::string("error opening file '") + filename + "' for writing"); } - streaming_strings_saver saver(out, strings_addr, &strings_storage, - std::move(compact_vector_subs)); + streaming_strings_saver saver(out, strings_addr, &strings_storage, std::move(address_subs)); saver.visit(t); out.close(); } +/* Helper: register a typed substitution at the address of `addr`. */ +template +inline void register_sub(std::unordered_map& subs, + T const* addr, std::string filename) { + subs.insert_or_assign(static_cast(addr), + typed_address_sub{std::move(filename), + std::type_index(typeid(T))}); +} + } // namespace sshash diff --git a/src/builder/build_sparse_and_skew_index.cpp b/src/builder/build_sparse_and_skew_index.cpp index 3e9851d..d4deba3 100644 --- a/src/builder/build_sparse_and_skew_index.cpp +++ b/src/builder/build_sparse_and_skew_index.cpp @@ -593,6 +593,7 @@ void dictionary_builder::build_sparse_and_skew_index( */ { spilled.skew_positions_paths.assign(num_partitions, std::string()); + spilled.skew_mphfs_paths.assign(num_partitions, std::string()); std::vector> mphfs; mphfs.resize(num_partitions); @@ -619,6 +620,12 @@ void dictionary_builder::build_sparse_and_skew_index( << ".skew_positions.p" << partition_id << ".bin"; return ss.str(); }; + auto skew_mphf_filename = [&](uint64_t partition_id) { + std::stringstream ss; + ss << build_config.tmp_dirname << "/sshash.tmp.run_" << pos_run_basename_id + << ".skew_mphf.p" << partition_id << ".bin"; + return ss.str(); + }; const uint64_t pos_buffer_capacity = std::max( uint64_t(1) << 16, @@ -756,6 +763,14 @@ void dictionary_builder::build_sparse_and_skew_index( std::cout << " built 
positions[" << partition_id << "] for " << n << " kmers; bits/key = " << num_bits_per_pos << std::endl; } + + /* Spill the partition's MPHF to disk (no longer needed + during build) and free its in-RAM footprint. The + accumulating skew MPHFs were the dominant resident + memory at the end of phase C. */ + spilled.skew_mphfs_paths[partition_id] = skew_mphf_filename(partition_id); + essentials::save(F, spilled.skew_mphfs_paths[partition_id].c_str()); + F = kmers_pthash_type{}; } /* advance partition state for the next iteration */ From dddee47de79dcd72717b9107eed4b9d86a7b2ed5 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 5 May 2026 18:01:23 +0000 Subject: [PATCH 13/32] cap pthash mphf num_threads by --ram-limit pthash's `partitioned_phf::build` builds the partitioned MPHF's sub-partitions in parallel, with `mphf_build_config.num_threads` sub-partitions running simultaneously. Each per-partition build allocates a `pairs_t` of ~`avg_partition_size * 16 B` (~48 MB with the default avg_partition_size = 3M); with -t 64 that balloons to ~3 GB of pthash internal memory, dominating the build's RSS regardless of how aggressively sshash spills its own structures. Add `util::cap_mphf_num_threads(requested, ram_limit_in_GiB)`: budget ~ram_limit/4 GiB for pthash's per-partition build memory and conservatively assume 64 MiB per parallel sub-partition (48 MB pairs_t + sort temporary + slack). Apply at the two pthash build sites (step 4 codewords MPHF, step 7.2 phase C skew partition MPHFs). For -g 2 -t 64 this caps pthash to 8 threads (1 GiB-pthash budget over 64 MiB-per-thread = 16 capped further by /4 budget fraction = 8). Build time may increase modestly; peak RSS should drop well below the previous ~3.3 GB toward the 2 GiB target. Verified byte-identical output on salmonella_enterica m=7, plus full --check on regular, --canonical, multi-thread (-t 4), and --weighted. 
https://claude.ai/code/session_01BShS2GDASvEsCAbgJyQVBK --- include/minimizers_control_map.hpp | 3 ++- include/util.hpp | 22 +++++++++++++++++++++ src/builder/build_sparse_and_skew_index.cpp | 3 ++- 3 files changed, 26 insertions(+), 2 deletions(-) diff --git a/include/minimizers_control_map.hpp b/include/minimizers_control_map.hpp index 834d607..889917b 100644 --- a/include/minimizers_control_map.hpp +++ b/include/minimizers_control_map.hpp @@ -13,7 +13,8 @@ struct minimizers_control_map // mphf_build_config.alpha = 0.94; mphf_build_config.seed = util::get_seed_for_hash_function(build_config); mphf_build_config.verbose = false; - mphf_build_config.num_threads = build_config.num_threads; + mphf_build_config.num_threads = + util::cap_mphf_num_threads(build_config.num_threads, build_config.ram_limit_in_GiB); mphf_build_config.avg_partition_size = constants::avg_partition_size; mphf_build_config.ram = (build_config.ram_limit_in_GiB * essentials::GiB) / 2; mphf_build_config.tmp_dir = build_config.tmp_dirname; diff --git a/include/util.hpp b/include/util.hpp index bf9bebd..fc93013 100644 --- a/include/util.hpp +++ b/include/util.hpp @@ -199,6 +199,28 @@ static inline uint64_t get_seed_for_hash_function(build_configuration const& bui return build_config.seed != my_favourite_seed ? my_favourite_seed : ~my_favourite_seed; } +/* + Cap pthash's `num_threads` so its `partitioned_phf::build` parallelism + fits in the user's --ram-limit budget. + + pthash builds sub-partitions of the partitioned MPHF in parallel; each + sub-partition allocates a `pairs_t` vector of roughly + `avg_partition_size * sizeof(pair)` bytes during `map`/sort. With the + default `avg_partition_size = 3,000,000` and ~16 B/pair this is on + the order of ~48 MB per thread; conservatively budget 64 MB per + parallel sub-partition (covers the sort temporary + small constants). + + With `--ram-limit = G` GiB we allow pthash up to `G/4` GiB for this + parallel build memory, capping pthash threads accordingly. 
+*/ +static inline uint64_t cap_mphf_num_threads(uint64_t requested_num_threads, + uint64_t ram_limit_in_GiB) { + constexpr uint64_t per_thread_estimate_bytes = uint64_t(64) << 20; // 64 MiB + const uint64_t budget_bytes = (ram_limit_in_GiB * essentials::GiB) / 4; + const uint64_t max_parallel = std::max(1, budget_bytes / per_thread_estimate_bytes); + return std::min(requested_num_threads, max_parallel); +} + [[maybe_unused]] static bool ends_with(std::string const& str, std::string const& pattern) { if (pattern.size() > str.size()) return false; return std::equal(pattern.begin(), pattern.end(), str.end() - pattern.size()); diff --git a/src/builder/build_sparse_and_skew_index.cpp b/src/builder/build_sparse_and_skew_index.cpp index d4deba3..eca3498 100644 --- a/src/builder/build_sparse_and_skew_index.cpp +++ b/src/builder/build_sparse_and_skew_index.cpp @@ -602,7 +602,8 @@ void dictionary_builder::build_sparse_and_skew_index( mphf_build_config.alpha = 0.94; mphf_build_config.seed = util::get_seed_for_hash_function(build_config); mphf_build_config.verbose = false; - mphf_build_config.num_threads = build_config.num_threads; + mphf_build_config.num_threads = + util::cap_mphf_num_threads(build_config.num_threads, build_config.ram_limit_in_GiB); mphf_build_config.avg_partition_size = constants::avg_partition_size; mphf_build_config.ram = (build_config.ram_limit_in_GiB * essentials::GiB) / 2; mphf_build_config.tmp_dir = build_config.tmp_dirname; From e18b9b4146dd5f8c0f55466dd772fdfe4c34219d Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 5 May 2026 18:06:22 +0000 Subject: [PATCH 14/32] mphf thread cap: only kick in when budget would actually be exceeded MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous cap formula (ram_limit/4) was too aggressive — it fired even at default -g 8 -t 64 where capping isn't needed. 
That silently changed pthash's parallelism on configurations the user didn't intend to be tight, breaking expectations of "-t means t threads". Loosen the budget to ram_limit/2: leave roughly half of --ram-limit available to pthash's parallel sub-partition build memory (the other half covers the streaming buffers, external sort buffers, etc.). Cap pthash threads only when the user's -t would push pthash past that half-budget. Common cases now pass through unchanged: -g 8 -t 64 no cap (4 GiB / 64 MiB = 64 >= 64) -g 4 -t 32 no cap -g 2 -t 16 no cap -g 16 -t 128 no cap -g 2 -t 64 cap to 16 (pathological: tight budget vs many threads) -g 4 -t 64 cap to 32 When the cap fires, log a clear warning naming the MPHF, the requested thread count, the cap, and the budget. Verbose mode only. Verified byte-identical output and full --check matrix still pass. https://claude.ai/code/session_01BShS2GDASvEsCAbgJyQVBK --- include/minimizers_control_map.hpp | 5 ++- include/util.hpp | 43 ++++++++++++++------- src/builder/build_sparse_and_skew_index.cpp | 5 ++- 3 files changed, 34 insertions(+), 19 deletions(-) diff --git a/include/minimizers_control_map.hpp b/include/minimizers_control_map.hpp index 889917b..9db52d6 100644 --- a/include/minimizers_control_map.hpp +++ b/include/minimizers_control_map.hpp @@ -13,8 +13,9 @@ struct minimizers_control_map // mphf_build_config.alpha = 0.94; mphf_build_config.seed = util::get_seed_for_hash_function(build_config); mphf_build_config.verbose = false; - mphf_build_config.num_threads = - util::cap_mphf_num_threads(build_config.num_threads, build_config.ram_limit_in_GiB); + mphf_build_config.num_threads = util::cap_mphf_num_threads( + build_config.num_threads, build_config.ram_limit_in_GiB, build_config.verbose, + "minimizers MPHF"); mphf_build_config.avg_partition_size = constants::avg_partition_size; mphf_build_config.ram = (build_config.ram_limit_in_GiB * essentials::GiB) / 2; mphf_build_config.tmp_dir = build_config.tmp_dirname; diff --git 
a/include/util.hpp b/include/util.hpp index fc93013..7ab56dd 100644 --- a/include/util.hpp +++ b/include/util.hpp @@ -200,25 +200,38 @@ static inline uint64_t get_seed_for_hash_function(build_configuration const& bui } /* - Cap pthash's `num_threads` so its `partitioned_phf::build` parallelism - fits in the user's --ram-limit budget. - - pthash builds sub-partitions of the partitioned MPHF in parallel; each - sub-partition allocates a `pairs_t` vector of roughly - `avg_partition_size * sizeof(pair)` bytes during `map`/sort. With the - default `avg_partition_size = 3,000,000` and ~16 B/pair this is on - the order of ~48 MB per thread; conservatively budget 64 MB per - parallel sub-partition (covers the sort temporary + small constants). - - With `--ram-limit = G` GiB we allow pthash up to `G/4` GiB for this - parallel build memory, capping pthash threads accordingly. + Cap pthash's `num_threads` only when leaving it equal to the user's + `-t` would push the build past `--ram-limit`. + + pthash's `partitioned_phf::build` builds the partitioned MPHF's + sub-partitions in parallel; each sub-partition allocates a `pairs_t` + of roughly `avg_partition_size * sizeof(pair)` bytes during + `map`/sort. With the default `avg_partition_size = 3,000,000` this is + on the order of ~48 MB per thread; we conservatively budget 64 MiB + per parallel sub-partition (covers the sort temporary + slack). + + The other build steps (the streaming buffers, the per-step external + sort buffers, etc.) use up to roughly half of `--ram-limit`, so we + leave the other half available to pthash. Cap pthash threads so that + `64 MiB * threads <= ram_limit/2`. If the user's `-t` already fits, + we don't touch it: this only kicks in for pathologically tight + budgets (small `--ram-limit` combined with large `-t`). 
*/ static inline uint64_t cap_mphf_num_threads(uint64_t requested_num_threads, - uint64_t ram_limit_in_GiB) { + uint64_t ram_limit_in_GiB, + bool verbose, + char const* mphf_name) { constexpr uint64_t per_thread_estimate_bytes = uint64_t(64) << 20; // 64 MiB - const uint64_t budget_bytes = (ram_limit_in_GiB * essentials::GiB) / 4; + const uint64_t budget_bytes = (ram_limit_in_GiB * essentials::GiB) / 2; const uint64_t max_parallel = std::max(1, budget_bytes / per_thread_estimate_bytes); - return std::min(requested_num_threads, max_parallel); + if (requested_num_threads <= max_parallel) return requested_num_threads; + if (verbose) { + std::cout << " --> WARNING: capping pthread mphf threads for " << mphf_name + << " from " << requested_num_threads << " to " << max_parallel + << " to fit --ram-limit=" << ram_limit_in_GiB << " GiB" + << " (pthash uses ~64 MiB per parallel sub-partition build)" << std::endl; + } + return max_parallel; } [[maybe_unused]] static bool ends_with(std::string const& str, std::string const& pattern) { diff --git a/src/builder/build_sparse_and_skew_index.cpp b/src/builder/build_sparse_and_skew_index.cpp index eca3498..8e7e267 100644 --- a/src/builder/build_sparse_and_skew_index.cpp +++ b/src/builder/build_sparse_and_skew_index.cpp @@ -602,8 +602,9 @@ void dictionary_builder::build_sparse_and_skew_index( mphf_build_config.alpha = 0.94; mphf_build_config.seed = util::get_seed_for_hash_function(build_config); mphf_build_config.verbose = false; - mphf_build_config.num_threads = - util::cap_mphf_num_threads(build_config.num_threads, build_config.ram_limit_in_GiB); + mphf_build_config.num_threads = util::cap_mphf_num_threads( + build_config.num_threads, build_config.ram_limit_in_GiB, build_config.verbose, + "skew partition MPHF"); mphf_build_config.avg_partition_size = constants::avg_partition_size; mphf_build_config.ram = (build_config.ram_limit_in_GiB * essentials::GiB) / 2; mphf_build_config.tmp_dir = build_config.tmp_dirname; From 
869f901d5ba6952d0566ce5e9c05f251bf709884 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 5 May 2026 18:17:16 +0000 Subject: [PATCH 15/32] mphf: scale avg_partition_size to honor -t; cap only when pathological Previous approach quietly capped pthash's num_threads under tight --ram-limit, which broke the user's expectation that "-t N means N threads". The actual lever pthash exposes for per-thread memory is `avg_partition_size`: per-partition build memory is roughly `avg_partition_size * sizeof(pair)` bytes. Replace the unconditional thread cap with a configuration that divides the pthash RAM budget (half of --ram-limit) evenly across the user's requested threads, then derives an `avg_partition_size` for that per-thread budget: - per_thread_budget = (ram_limit / 2) / num_threads - avg = per_thread_budget / per_key_estimate (32 B) - if avg >= default (3M): use default; -t honored - elif avg >= floor (100K): use avg; -t honored - else: cap -t so floor fits, warn The warning only fires in the pathological case (so many threads at so little RAM that even pthash's quality floor can't fit). Common configurations pass through unchanged: -t 1 -g 8 no change (default partition 3M) -t 64 -g 8 -t honored, partition 2M -t 64 -g 4 -t honored, partition 1M -t 64 -g 2 -t honored, partition 524K -t 256 -g 1 warn + cap to 167 (-t couldn't be honored) Verified byte-identical output on -t 1 builds (the default single-thread case) and full --check matrix on regular, --canonical, multi-thread, and --weighted. 
https://claude.ai/code/session_01BShS2GDASvEsCAbgJyQVBK --- include/minimizers_control_map.hpp | 7 +- include/util.hpp | 83 ++++++++++++++------- src/builder/build_sparse_and_skew_index.cpp | 8 +- 3 files changed, 63 insertions(+), 35 deletions(-) diff --git a/include/minimizers_control_map.hpp b/include/minimizers_control_map.hpp index 9db52d6..5665cf9 100644 --- a/include/minimizers_control_map.hpp +++ b/include/minimizers_control_map.hpp @@ -13,10 +13,9 @@ struct minimizers_control_map // mphf_build_config.alpha = 0.94; mphf_build_config.seed = util::get_seed_for_hash_function(build_config); mphf_build_config.verbose = false; - mphf_build_config.num_threads = util::cap_mphf_num_threads( - build_config.num_threads, build_config.ram_limit_in_GiB, build_config.verbose, - "minimizers MPHF"); - mphf_build_config.avg_partition_size = constants::avg_partition_size; + util::configure_mphf_threads_and_partition(mphf_build_config, build_config.num_threads, + build_config.ram_limit_in_GiB, + build_config.verbose, "minimizers MPHF"); mphf_build_config.ram = (build_config.ram_limit_in_GiB * essentials::GiB) / 2; mphf_build_config.tmp_dir = build_config.tmp_dirname; diff --git a/include/util.hpp b/include/util.hpp index 7ab56dd..e45574f 100644 --- a/include/util.hpp +++ b/include/util.hpp @@ -200,38 +200,67 @@ static inline uint64_t get_seed_for_hash_function(build_configuration const& bui } /* - Cap pthash's `num_threads` only when leaving it equal to the user's - `-t` would push the build past `--ram-limit`. + Configure pthash's `num_threads` and `avg_partition_size` so that the + parallel sub-partition build memory fits in the user's --ram-limit + without unilaterally reducing the user's requested thread count. pthash's `partitioned_phf::build` builds the partitioned MPHF's sub-partitions in parallel; each sub-partition allocates a `pairs_t` - of roughly `avg_partition_size * sizeof(pair)` bytes during - `map`/sort. 
With the default `avg_partition_size = 3,000,000` this is - on the order of ~48 MB per thread; we conservatively budget 64 MiB - per parallel sub-partition (covers the sort temporary + slack). - - The other build steps (the streaming buffers, the per-step external - sort buffers, etc.) use up to roughly half of `--ram-limit`, so we - leave the other half available to pthash. Cap pthash threads so that - `64 MiB * threads <= ram_limit/2`. If the user's `-t` already fits, - we don't touch it: this only kicks in for pathologically tight - budgets (small `--ram-limit` combined with large `-t`). + of roughly `avg_partition_size * sizeof(pair)` bytes during the + `map` + sort step. So per-thread peak ≈ `avg_partition_size * + per_key_bytes`. We can scale `avg_partition_size` down to fit any + desired per-thread budget — the only floor is pthash's hash-search + quality, for which `avg_partition_size` should not go below ~100k. + + Strategy: split half of `--ram-limit` evenly across the requested + threads (the other half covers sshash's own buffers). For each + thread compute `per_thread_budget`, derive a candidate + `avg_partition_size`, and use it (clamped at the default upper end + so we never make partitions larger than usual). Only when the + derived `avg_partition_size` falls below the floor do we fall back + to capping threads — in that case we emit a warning naming the MPHF + so the user knows the requested -t couldn't be honored. 
*/ -static inline uint64_t cap_mphf_num_threads(uint64_t requested_num_threads, - uint64_t ram_limit_in_GiB, - bool verbose, - char const* mphf_name) { - constexpr uint64_t per_thread_estimate_bytes = uint64_t(64) << 20; // 64 MiB - const uint64_t budget_bytes = (ram_limit_in_GiB * essentials::GiB) / 2; - const uint64_t max_parallel = std::max(1, budget_bytes / per_thread_estimate_bytes); - if (requested_num_threads <= max_parallel) return requested_num_threads; - if (verbose) { - std::cout << " --> WARNING: capping pthread mphf threads for " << mphf_name - << " from " << requested_num_threads << " to " << max_parallel - << " to fit --ram-limit=" << ram_limit_in_GiB << " GiB" - << " (pthash uses ~64 MiB per parallel sub-partition build)" << std::endl; +static inline void configure_mphf_threads_and_partition( + pthash::build_configuration& mphf, // + uint64_t requested_num_threads, // + uint64_t ram_limit_in_GiB, // + bool verbose, // + char const* mphf_name) // +{ + constexpr uint64_t per_key_bytes = 32; // pairs_t entry + sort slack + constexpr uint64_t min_avg_partition_size = uint64_t(100) * 1000; + const uint64_t default_avg = constants::avg_partition_size; + + const uint64_t pthash_ram = (ram_limit_in_GiB * essentials::GiB) / 2; + const uint64_t per_thread = + pthash_ram / std::max(1, requested_num_threads); + const uint64_t avg_for_thread_budget = per_thread / per_key_bytes; + + if (avg_for_thread_budget >= default_avg) { + /* Plenty of RAM per thread — keep the default partition size. */ + mphf.num_threads = requested_num_threads; + mphf.avg_partition_size = default_avg; + } else if (avg_for_thread_budget >= min_avg_partition_size) { + /* Tighter per-thread budget: shrink partitions to fit; threads + honored. */ + mphf.num_threads = requested_num_threads; + mphf.avg_partition_size = avg_for_thread_budget; + } else { + /* Pathological: not enough RAM per thread even at the floor. + Cap threads so the floor fits. 
*/ + const uint64_t max_threads = std::max( + 1, pthash_ram / (per_key_bytes * min_avg_partition_size)); + if (verbose) { + std::cout << " --> WARNING: not enough RAM per thread for " << mphf_name + << " (--ram-limit=" << ram_limit_in_GiB << " GiB, " + << requested_num_threads << " requested threads): capping to " + << max_threads << " threads at min partition size " + << min_avg_partition_size << std::endl; + } + mphf.num_threads = max_threads; + mphf.avg_partition_size = min_avg_partition_size; } - return max_parallel; } [[maybe_unused]] static bool ends_with(std::string const& str, std::string const& pattern) { diff --git a/src/builder/build_sparse_and_skew_index.cpp b/src/builder/build_sparse_and_skew_index.cpp index 8e7e267..aa8ea08 100644 --- a/src/builder/build_sparse_and_skew_index.cpp +++ b/src/builder/build_sparse_and_skew_index.cpp @@ -602,10 +602,10 @@ void dictionary_builder::build_sparse_and_skew_index( mphf_build_config.alpha = 0.94; mphf_build_config.seed = util::get_seed_for_hash_function(build_config); mphf_build_config.verbose = false; - mphf_build_config.num_threads = util::cap_mphf_num_threads( - build_config.num_threads, build_config.ram_limit_in_GiB, build_config.verbose, - "skew partition MPHF"); - mphf_build_config.avg_partition_size = constants::avg_partition_size; + util::configure_mphf_threads_and_partition(mphf_build_config, build_config.num_threads, + build_config.ram_limit_in_GiB, + build_config.verbose, + "skew partition MPHF"); mphf_build_config.ram = (build_config.ram_limit_in_GiB * essentials::GiB) / 2; mphf_build_config.tmp_dir = build_config.tmp_dirname; From 8e4b0d8e3c543c1e9d70db82c10a224f30902913 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 5 May 2026 18:41:40 +0000 Subject: [PATCH 16/32] clamp --ram-limit to a 4 GiB floor MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Below ~4 GiB the streaming buffers + pthash's internal working memory (which we don't fully control) can't 
practically be made to fit; squeezing further has diminishing returns. Rather than degrade the build into ever-tinier buffers and ever-smaller pthash partition sizes, treat 4 GiB as the effective floor — a modest requirement on today's desktops. Add `constants::min_ram_limit_in_GiB = 4` and apply it in the single validation/normalization step at the build entrypoint (both `dictionary::build` and `dictionary::build_streaming_save`). A NOTE is printed in verbose mode whenever the user's `-g` is raised. Configurations at or above the floor are unaffected. Verified byte-identical output on the default build, full --check on regular, --canonical, multi-thread, and --weighted. https://claude.ai/code/session_01BShS2GDASvEsCAbgJyQVBK --- include/constants.hpp | 5 +++++ src/builder/build.cpp | 27 +++++++++++++++++++++------ 2 files changed, 26 insertions(+), 6 deletions(-) diff --git a/include/constants.hpp b/include/constants.hpp index 80f20fa..a020a6a 100644 --- a/include/constants.hpp +++ b/include/constants.hpp @@ -4,6 +4,11 @@ namespace sshash::constants { constexpr uint64_t invalid_uint64 = uint64_t(-1); constexpr uint64_t default_ram_limit_in_GiB = 8; +/* Floor on --ram-limit. Below this the build's streaming buffers + pthash's + internal working memory can't usefully be made to fit; rather than degrade + further at very tight budgets, we clamp `-g` to at least this value + (modest by today's desktop standards). 
*/ +constexpr uint64_t min_ram_limit_in_GiB = 4; constexpr uint64_t seed = 1; /* for PTHash */ diff --git a/src/builder/build.cpp b/src/builder/build.cpp index 76d0b97..f1354d9 100644 --- a/src/builder/build.cpp +++ b/src/builder/build.cpp @@ -8,8 +8,8 @@ namespace sshash { namespace { -inline void validate_build_config_or_throw(build_configuration const& bc, uint64_t max_k, - uint64_t max_m) { +inline void validate_and_normalize_build_config(build_configuration& bc, uint64_t max_k, + uint64_t max_m) { if (bc.k == 0) throw std::runtime_error("k must be > 0"); if (bc.k > max_k) { throw std::runtime_error("k must be less <= " + std::to_string(max_k) + @@ -21,6 +21,19 @@ inline void validate_build_config_or_throw(build_configuration const& bc, uint64 " but got m = " + std::to_string(bc.m)); } if (bc.m > bc.k) throw std::runtime_error("m must be <= k"); + + /* Clamp --ram-limit to the floor. Below this, the streaming buffers + plus pthash's internal working memory can't usefully be made to + fit; rather than try to squeeze further we treat the floor as the + effective budget. 
*/ + if (bc.ram_limit_in_GiB < constants::min_ram_limit_in_GiB) { + if (bc.verbose) { + std::cout << " --> NOTE: --ram-limit raised from " << bc.ram_limit_in_GiB + << " GiB to the floor of " << constants::min_ram_limit_in_GiB << " GiB" + << std::endl; + } + bc.ram_limit_in_GiB = constants::min_ram_limit_in_GiB; + } } } // namespace @@ -29,8 +42,9 @@ template void dictionary::build(std::string const& filename, build_configuration const& build_config) // { - validate_build_config_or_throw(build_config, Kmer::max_k, Kmer::max_m); - dictionary_builder builder(build_config); + build_configuration bc = build_config; + validate_and_normalize_build_config(bc, Kmer::max_k, Kmer::max_m); + dictionary_builder builder(bc); builder.build(*this, filename); } @@ -39,8 +53,9 @@ void dictionary::build_streaming_save( std::string const& input_filename, build_configuration const& build_config, std::string const& output_filename) // { - validate_build_config_or_throw(build_config, Kmer::max_k, Kmer::max_m); - dictionary_builder builder(build_config); + build_configuration bc = build_config; + validate_and_normalize_build_config(bc, Kmer::max_k, Kmer::max_m); + dictionary_builder builder(bc); builder.build_streaming_save(*this, input_filename, output_filename); } From f68fa779d63bedb4d8a2e3cf49d12bc372834b7d Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 6 May 2026 08:23:16 +0000 Subject: [PATCH 17/32] bump pthash to claude/fix-pthash-memory-estimate-NhgTI Picks up two commits on the pthash branch: 90ace87 Account for input hash vector in construction memory estimate 9600827 Drop the redundant in-search hashes term and the comment The fix corrects pthash's `estimate_num_bytes_for_construction`, which underestimated per-partition residency by `num_keys * sizeof(hash_type)` (= 16 B/key for hash128) and by an extra `num_keys * 8 B` double-counted "in-search hashes" term. 
With the corrected estimate, pthash's `bytes < config.ram` flush gate in the parallel partitioned-PHF build path actually matches residency, so `mphf_build_config.ram = ram_limit/2` will now bind pthash's parallel build batch to that budget on inputs like HPRC k=63 m=31 canonical (where the previous underestimate had pthash's batch peak around 1.5x the configured budget). Verified byte-identical SSHash output before/after the bump on salmonella_enterica m=7 (the change is purely a memory-accounting fix). Full --check matrix passes. When pthash master receives this fix we'll bump again to that tip; for now we point at the branch tip directly so the HPRC benchmark can validate the new RSS bound. https://claude.ai/code/session_01BShS2GDASvEsCAbgJyQVBK --- external/pthash | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/external/pthash b/external/pthash index e04a192..9600827 160000 --- a/external/pthash +++ b/external/pthash @@ -1 +1 @@ -Subproject commit e04a1920ffeae9e7d876acd0362cab79605f7af3 +Subproject commit 960082760bdf2e7315c0b827e747acb84a2c7c99 From c550d532069948bd8f0b1b3a699b8cb8bfe37bf2 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 6 May 2026 08:44:01 +0000 Subject: [PATCH 18/32] tighten ram-proportional buffer caps from ram/4 to ram/8 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After the pthash memory-estimate fix, pthash's parallel-build batch correctly binds at `ram_limit/2`. The remaining excess on the HPRC k=63 m=31 -g 4 -t 64 benchmark was sshash's own buffers (each currently sized at ram_limit/4) piled on top: - step 5 minimizer-tuples buffer: up to RAM_available/3 (~1.1 GB at -g 4) — and freed but typically retained by glibc, so it lingers in process RSS through subsequent steps. - step 7.1 kmer-extraction request buffer: ram_limit/4 (1 GiB at -g 4). 
- step 7.2 phase C position-tuple buffer: ram_limit/4 (1 GiB at -g 4) — alive concurrently with pthash's parallel-build memory and the partition's MPHF. Halve them all (cap at ram_limit/8 = 512 MiB at -g 4). The external sorts get more flush rounds and slightly more disk I/O, but peak RSS during the heaviest step (7.2 phase C) drops by roughly: pthash 2 GiB + pos buffer 0.5 GiB + partition MPHF 0.3 GB + step-5 lingering 0.5 GiB ≈ 3.3 GiB vs the prior ~4.8 GiB observed peak. Should fit comfortably under -g 4. Verified byte-identical SSHash output on salmonella_enterica m=7 (more flush rounds = more intermediate run files but the sort+merge is order-stable for the bytes we care about). https://claude.ai/code/session_01BShS2GDASvEsCAbgJyQVBK --- include/builder/dictionary_builder.hpp | 16 +++++++++++++--- src/builder/build_sparse_and_skew_index.cpp | 11 ++++++++--- 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/include/builder/dictionary_builder.hpp b/include/builder/dictionary_builder.hpp index 58a3ab1..b3e2b34 100644 --- a/include/builder/dictionary_builder.hpp +++ b/include/builder/dictionary_builder.hpp @@ -347,9 +347,19 @@ struct dictionary_builder // } const uint64_t num_super_kmers = minimizers.num_super_kmers(); - const uint64_t buffer_size = num_files_to_merge == 1 - ? num_super_kmers - : (RAM_available_in_bytes / (3 * sizeof(minimizer_tuple))); + /* Cap the in-RAM buffer at ram_limit/8 worth of tuples so that + even when subsequent steps fragment the heap, step 5's lingering + pages don't blow past the budget when stacked with later step's + allocations. */ + const uint64_t buffer_cap_bytes = + (build_config.ram_limit_in_GiB * essentials::GiB) / 8; + const uint64_t buffer_cap_records = + std::max(uint64_t(1) << 16, buffer_cap_bytes / sizeof(minimizer_tuple)); + const uint64_t buffer_size_unbounded = + num_files_to_merge == 1 + ? 
num_super_kmers + : (RAM_available_in_bytes / (3 * sizeof(minimizer_tuple))); + const uint64_t buffer_size = std::min(buffer_size_unbounded, buffer_cap_records); const uint64_t num_blocks = (num_super_kmers + buffer_size - 1) / buffer_size; assert(num_super_kmers > (num_blocks - 1) * buffer_size); diff --git a/src/builder/build_sparse_and_skew_index.cpp b/src/builder/build_sparse_and_skew_index.cpp index aa8ea08..30ecdd5 100644 --- a/src/builder/build_sparse_and_skew_index.cpp +++ b/src/builder/build_sparse_and_skew_index.cpp @@ -307,12 +307,14 @@ void dictionary_builder::build_sparse_and_skew_index( }; /* External-sort buffer for kmer-extraction requests (formerly step 7.2 - phase A; now folded into the combined pass). */ + phase A; now folded into the combined pass). Capped at ram_limit/8 + so heap fragmentation across steps doesn't push peak RSS past the + --ram-limit budget. */ std::atomic num_request_runs{0}; const uint64_t request_buffer_capacity = std::max( uint64_t(1) << 16, (build_config.ram_limit_in_GiB * essentials::GiB) / - (4 * sizeof(kmer_extraction_request))); + (8 * sizeof(kmer_extraction_request))); std::vector request_buffer; request_buffer.reserve(request_buffer_capacity); auto flush_request_buffer = [&]() { @@ -629,9 +631,12 @@ void dictionary_builder::build_sparse_and_skew_index( return ss.str(); }; + /* Capped at ram_limit/8: this buffer is alive during phase C + alongside pthash's parallel-build memory and the currently- + building partition's MPHF, so it has to share the RAM budget. 
*/ const uint64_t pos_buffer_capacity = std::max( uint64_t(1) << 16, - (build_config.ram_limit_in_GiB * essentials::GiB) / (4 * sizeof(position_tuple))); + (build_config.ram_limit_in_GiB * essentials::GiB) / (8 * sizeof(position_tuple))); uint64_t lower = min_size; uint64_t upper = 2 * lower; From 365758b7871b9ea69ddc6d886eae04d8b14fa074 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 6 May 2026 09:19:00 +0000 Subject: [PATCH 19/32] bump pthash submodule to master tip a95e814 PR #97 ("fix pthash memory estimate") landed in pthash master. This bump moves us from the (now-merged) branch tip 9600827 to the master squash-merge commit a95e814. The substantive content of `internal_memory_builder_single_phf.hpp` is byte-identical between the two, so SSHash's behavior is unchanged from f68fa77. This unpins us from the development branch URL and lets us track pthash master going forward. https://claude.ai/code/session_01BShS2GDASvEsCAbgJyQVBK --- external/pthash | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/external/pthash b/external/pthash index 9600827..a95e814 160000 --- a/external/pthash +++ b/external/pthash @@ -1 +1 @@ -Subproject commit 960082760bdf2e7315c0b827e747acb84a2c7c99 +Subproject commit a95e8147a8ba1fa33b57fa24de7b5e674423e9a7 From 526b64bb6f37d4629ad88c16db0d4d0547187ee4 Mon Sep 17 00:00:00 2001 From: jermp Date: Wed, 6 May 2026 16:17:04 +0200 Subject: [PATCH 20/32] clang format --- include/builder/dictionary_builder.hpp | 25 +++++----- .../builder/disk_backed_offsets_builder.hpp | 7 +-- include/builder/disk_backed_strings.hpp | 23 +++++---- .../streaming_compact_vector_writer.hpp | 10 ++-- include/builder/streaming_save.hpp | 19 ++++---- src/builder/build.cpp | 6 +-- src/builder/build_sparse_and_skew_index.cpp | 47 ++++++++----------- 7 files changed, 60 insertions(+), 77 deletions(-) diff --git a/include/builder/dictionary_builder.hpp b/include/builder/dictionary_builder.hpp index b3e2b34..4cf8d0c 100644 --- 
a/include/builder/dictionary_builder.hpp +++ b/include/builder/dictionary_builder.hpp @@ -93,9 +93,9 @@ struct dictionary_builder // caller only needs the on-disk index file and wants to keep peak RAM bounded by the build phase. */ - void build_streaming_save(dictionary& d, // - std::string const& filename, // - std::string const& output_filename) // + void build_streaming_save(dictionary& d, // + std::string const& filename, // + std::string const& output_filename) // { run_steps_1_through_7(d, filename); do_step("step 8 (stream-save dictionary to disk)", [&]() { @@ -122,8 +122,8 @@ struct dictionary_builder // /* Skew positions / mphfs: populate the owning_spans with placeholders so the visit walks the right number of entries and we can take their addresses for substitution. */ - const std::size_t num_part = std::max(spilled.skew_positions_paths.size(), - spilled.skew_mphfs_paths.size()); + const std::size_t num_part = + std::max(spilled.skew_positions_paths.size(), spilled.skew_mphfs_paths.size()); if (num_part > 0) { std::vector position_placeholders(num_part); std::vector> mphf_placeholders(num_part); @@ -137,8 +137,7 @@ struct dictionary_builder // } for (std::size_t i = 0; i != spilled.skew_mphfs_paths.size(); ++i) { if (!spilled.skew_mphfs_paths[i].empty()) { - register_sub(subs, &d.m_ssi.ski.mphfs[i], - spilled.skew_mphfs_paths[i]); + register_sub(subs, &d.m_ssi.ski.mphfs[i], spilled.skew_mphfs_paths[i]); } } } @@ -186,8 +185,8 @@ struct dictionary_builder // essentials::loader loader(spilled.codewords_mphf_path.c_str()); loader.visit(d.m_ssi.codewords.mphf); } - const std::size_t num_part = std::max(spilled.skew_positions_paths.size(), - spilled.skew_mphfs_paths.size()); + const std::size_t num_part = + std::max(spilled.skew_positions_paths.size(), spilled.skew_mphfs_paths.size()); if (num_part > 0) { std::vector positions_vec(num_part); std::vector> mphfs_vec(num_part); @@ -351,14 +350,12 @@ struct dictionary_builder // even when subsequent steps 
fragment the heap, step 5's lingering pages don't blow past the budget when stacked with later step's allocations. */ - const uint64_t buffer_cap_bytes = - (build_config.ram_limit_in_GiB * essentials::GiB) / 8; + const uint64_t buffer_cap_bytes = (build_config.ram_limit_in_GiB * essentials::GiB) / 8; const uint64_t buffer_cap_records = std::max(uint64_t(1) << 16, buffer_cap_bytes / sizeof(minimizer_tuple)); const uint64_t buffer_size_unbounded = - num_files_to_merge == 1 - ? num_super_kmers - : (RAM_available_in_bytes / (3 * sizeof(minimizer_tuple))); + num_files_to_merge == 1 ? num_super_kmers + : (RAM_available_in_bytes / (3 * sizeof(minimizer_tuple))); const uint64_t buffer_size = std::min(buffer_size_unbounded, buffer_cap_records); const uint64_t num_blocks = (num_super_kmers + buffer_size - 1) / buffer_size; assert(num_super_kmers > (num_blocks - 1) * buffer_size); diff --git a/include/builder/disk_backed_offsets_builder.hpp b/include/builder/disk_backed_offsets_builder.hpp index c86d33f..6be3562 100644 --- a/include/builder/disk_backed_offsets_builder.hpp +++ b/include/builder/disk_backed_offsets_builder.hpp @@ -96,9 +96,7 @@ struct disk_backed_offsets_builder { std::string const& filename() const { return m_filename; } /* In-RAM footprint of the builder (excluding the on-disk file). 
*/ - uint64_t num_bytes() const { - return sizeof(m_nb) + m_buf.capacity() * sizeof(uint64_t); - } + uint64_t num_bytes() const { return sizeof(m_nb) + m_buf.capacity() * sizeof(uint64_t); } void set_num_bits(num_bits nb) { m_nb = nb; } @@ -151,8 +149,7 @@ struct disk_backed_offsets_builder { if (!m_in.is_open()) { throw std::runtime_error("cannot open offsets tmp file '" + filename + "'"); } - m_in.seekg(static_cast(start_index * sizeof(uint64_t)), - std::ios::beg); + m_in.seekg(static_cast(start_index * sizeof(uint64_t)), std::ios::beg); refill(); } diff --git a/include/builder/disk_backed_strings.hpp b/include/builder/disk_backed_strings.hpp index dabafe9..d267d4a 100644 --- a/include/builder/disk_backed_strings.hpp +++ b/include/builder/disk_backed_strings.hpp @@ -153,17 +153,17 @@ struct disk_backed_strings { const uint64_t block = bit_pos >> 6; const uint64_t shift = bit_pos & 63; ensure_window_covers(block); - uint64_t a = (block >= m_window_start_word && - block < m_window_start_word + m_window_size) - ? m_window[block - m_window_start_word] - : uint64_t(0); + uint64_t a = + (block >= m_window_start_word && block < m_window_start_word + m_window_size) + ? m_window[block - m_window_start_word] + : uint64_t(0); uint64_t word = a >> shift; if (shift) { const uint64_t next = block + 1; - uint64_t b = (next >= m_window_start_word && - next < m_window_start_word + m_window_size) - ? m_window[next - m_window_start_word] - : uint64_t(0); + uint64_t b = + (next >= m_window_start_word && next < m_window_start_word + m_window_size) + ? 
m_window[next - m_window_start_word] + : uint64_t(0); word |= b << (64 - shift); } return word; @@ -185,8 +185,7 @@ struct disk_backed_strings { return; } m_in.clear(); // clear any prior eof - m_in.seekg(static_cast(target_word * sizeof(uint64_t)), - std::ios::beg); + m_in.seekg(static_cast(target_word * sizeof(uint64_t)), std::ios::beg); const uint64_t to_read = std::min(m_window_capacity, m_total_words - target_word); m_in.read(reinterpret_cast(m_window.data()), static_cast(to_read * sizeof(uint64_t))); @@ -264,8 +263,8 @@ struct disk_backed_strings { std::vector buffer(uint64_t(64) << 10); // 64 KiB uint64_t bytes_remaining = total_words * sizeof(uint64_t); while (bytes_remaining > 0) { - const std::streamsize chunk = static_cast( - std::min(buffer.size(), bytes_remaining)); + const std::streamsize chunk = + static_cast(std::min(buffer.size(), bytes_remaining)); in.read(buffer.data(), chunk); const std::streamsize got = in.gcount(); if (got <= 0) { diff --git a/include/builder/streaming_compact_vector_writer.hpp b/include/builder/streaming_compact_vector_writer.hpp index ea9dce1..e36e83b 100644 --- a/include/builder/streaming_compact_vector_writer.hpp +++ b/include/builder/streaming_compact_vector_writer.hpp @@ -32,8 +32,10 @@ struct streaming_compact_vector_writer { streaming_compact_vector_writer& operator=(streaming_compact_vector_writer const&) = delete; void open(std::string const& filename, uint64_t num_entries, uint64_t width) { - if (width == 0) throw std::runtime_error("streaming_compact_vector_writer: width must be > 0"); - if (width > 64) throw std::runtime_error("streaming_compact_vector_writer: width must be <= 64"); + if (width == 0) + throw std::runtime_error("streaming_compact_vector_writer: width must be > 0"); + if (width > 64) + throw std::runtime_error("streaming_compact_vector_writer: width must be <= 64"); m_filename = filename; m_num_entries = num_entries; m_width = width; @@ -66,9 +68,7 @@ struct streaming_compact_vector_writer { /* 
Write a value at position `index`. Successive calls must satisfy `index >= previous_index`; gaps are filled with zero. */ void set(uint64_t index, uint64_t value) { - if (m_have_last_index) { - assert(index >= m_last_index); - } + if (m_have_last_index) { assert(index >= m_last_index); } m_have_last_index = true; m_last_index = index; diff --git a/include/builder/streaming_save.hpp b/include/builder/streaming_save.hpp index 7ccc87c..78db027 100644 --- a/include/builder/streaming_save.hpp +++ b/include/builder/streaming_save.hpp @@ -54,9 +54,9 @@ struct typed_address_sub { recursion via val.visit(*this)). */ struct streaming_strings_saver { - streaming_strings_saver(std::ostream& os, // - bits::bit_vector const* strings_addr, // - disk_backed_strings const* strings_storage, // + streaming_strings_saver(std::ostream& os, // + bits::bit_vector const* strings_addr, // + disk_backed_strings const* strings_storage, // std::unordered_map address_subs) : m_os(os) , m_strings_addr(strings_addr) @@ -149,9 +149,9 @@ struct streaming_strings_saver { standard essentials path. */ template -void save_streaming(T const& t, char const* filename, // - bits::bit_vector const* strings_addr, // - disk_backed_strings const& strings_storage, // +void save_streaming(T const& t, char const* filename, // + bits::bit_vector const* strings_addr, // + disk_backed_strings const& strings_storage, // std::unordered_map address_subs = {}) { std::ofstream out(filename, std::ios::binary); if (!out.good()) { @@ -164,11 +164,10 @@ void save_streaming(T const& t, char const* filename, /* Helper: register a typed substitution at the address of `addr`. 
*/ template -inline void register_sub(std::unordered_map& subs, - T const* addr, std::string filename) { +inline void register_sub(std::unordered_map& subs, T const* addr, + std::string filename) { subs.insert_or_assign(static_cast(addr), - typed_address_sub{std::move(filename), - std::type_index(typeid(T))}); + typed_address_sub{std::move(filename), std::type_index(typeid(T))}); } } // namespace sshash diff --git a/src/builder/build.cpp b/src/builder/build.cpp index f1354d9..0f6210b 100644 --- a/src/builder/build.cpp +++ b/src/builder/build.cpp @@ -49,9 +49,9 @@ void dictionary::build(std::string const& filename, } template -void dictionary::build_streaming_save( - std::string const& input_filename, build_configuration const& build_config, - std::string const& output_filename) // +void dictionary::build_streaming_save(std::string const& input_filename, + build_configuration const& build_config, + std::string const& output_filename) // { build_configuration bc = build_config; validate_and_normalize_build_config(bc, Kmer::max_k, Kmer::max_m); diff --git a/src/builder/build_sparse_and_skew_index.cpp b/src/builder/build_sparse_and_skew_index.cpp index 30ecdd5..a861093 100644 --- a/src/builder/build_sparse_and_skew_index.cpp +++ b/src/builder/build_sparse_and_skew_index.cpp @@ -14,19 +14,15 @@ namespace sshash { #pragma pack(push, 4) struct kmer_extraction_request { kmer_extraction_request() {} - kmer_extraction_request(uint64_t starting_pos, uint32_t partition_id, - uint32_t pos_in_bucket, uint32_t num_kmers_in_super_kmer) + kmer_extraction_request(uint64_t starting_pos, uint32_t partition_id, uint32_t pos_in_bucket, + uint32_t num_kmers_in_super_kmer) : starting_pos(starting_pos) , partition_id(partition_id) , pos_in_bucket(pos_in_bucket) , num_kmers_in_super_kmer(num_kmers_in_super_kmer) {} - bool operator<(kmer_extraction_request const& o) const { - return starting_pos < o.starting_pos; - } - bool operator>(kmer_extraction_request const& o) const { - return 
starting_pos > o.starting_pos; - } + bool operator<(kmer_extraction_request const& o) const { return starting_pos < o.starting_pos; } + bool operator>(kmer_extraction_request const& o) const { return starting_pos > o.starting_pos; } static kmer_extraction_request max() { return kmer_extraction_request(uint64_t(-1), uint32_t(-1), uint32_t(-1), uint32_t(-1)); @@ -232,8 +228,7 @@ void dictionary_builder::build_sparse_and_skew_index( std::vector begin_buckets_of_size(min_size + 1, 0); for (uint64_t s = 3; s <= min_size; ++s) { begin_buckets_of_size[s] = static_cast( // - begin_buckets_of_size[s - 1] + - buckets_stats.num_buckets_of_size(s - 1) * (s - 1)); + begin_buckets_of_size[s - 1] + buckets_stats.num_buckets_of_size(s - 1) * (s - 1)); } d.m_ssi.begin_buckets_of_size = std::move(begin_buckets_of_size); @@ -270,8 +265,8 @@ void dictionary_builder::build_sparse_and_skew_index( compact_vector writer to assemble the final mid_load_buckets file. */ auto mid_load_per_size_path = [&](uint64_t s) { std::stringstream ss; - ss << build_config.tmp_dirname << "/sshash.tmp.run_" << step7_run_id - << ".mid_load_size_" << s << ".bin"; + ss << build_config.tmp_dirname << "/sshash.tmp.run_" << step7_run_id << ".mid_load_size_" + << s << ".bin"; return ss.str(); }; std::vector mid_load_per_size(min_size + 1); @@ -295,14 +290,14 @@ void dictionary_builder::build_sparse_and_skew_index( const uint64_t skew_run_id = pthash::clock_type::now().time_since_epoch().count(); auto request_run_filename = [&](uint64_t id) { std::stringstream ss; - ss << build_config.tmp_dirname << "/sshash.tmp.run_" << skew_run_id - << ".kmer_requests." << id << ".bin"; + ss << build_config.tmp_dirname << "/sshash.tmp.run_" << skew_run_id << ".kmer_requests." + << id << ".bin"; return ss.str(); }; auto skew_partition_filename = [&](uint64_t pid) { std::stringstream ss; - ss << build_config.tmp_dirname << "/sshash.tmp.run_" << skew_run_id - << ".skew_kmers." 
<< pid << ".bin"; + ss << build_config.tmp_dirname << "/sshash.tmp.run_" << skew_run_id << ".skew_kmers." << pid + << ".bin"; return ss.str(); }; @@ -311,10 +306,9 @@ void dictionary_builder::build_sparse_and_skew_index( so heap fragmentation across steps doesn't push peak RSS past the --ram-limit budget. */ std::atomic num_request_runs{0}; - const uint64_t request_buffer_capacity = std::max( - uint64_t(1) << 16, - (build_config.ram_limit_in_GiB * essentials::GiB) / - (8 * sizeof(kmer_extraction_request))); + const uint64_t request_buffer_capacity = + std::max(uint64_t(1) << 16, (build_config.ram_limit_in_GiB * essentials::GiB) / + (8 * sizeof(kmer_extraction_request))); std::vector request_buffer; request_buffer.reserve(request_buffer_capacity); auto flush_request_buffer = [&]() { @@ -424,8 +418,7 @@ void dictionary_builder::build_sparse_and_skew_index( d.m_spss.strings_offsets.decode(mt.pos_in_seq).absolute_offset; const uint64_t starting_pos = abs_offset - mt.pos_in_kmer; if (request_buffer.size() == request_buffer_capacity) flush_request_buffer(); - request_buffer.emplace_back(starting_pos, uint32_t(partition_id), - pos_in_bucket, + request_buffer.emplace_back(starting_pos, uint32_t(partition_id), pos_in_bucket, uint32_t(mt.num_kmers_in_super_kmer)); } } @@ -452,8 +445,7 @@ void dictionary_builder::build_sparse_and_skew_index( { streaming_compact_vector_writer mid_load_writer; mid_load_writer.open(spilled.mid_load_buckets_path, - num_minimizer_positions_of_buckets_larger_than_1, - num_bits_per_offset); + num_minimizer_positions_of_buckets_larger_than_1, num_bits_per_offset); uint64_t global_index = 0; for (uint64_t s = 2; s <= min_size; ++s) { const uint64_t expected = buckets_stats.num_buckets_of_size(s) * s; @@ -518,8 +510,8 @@ void dictionary_builder::build_sparse_and_skew_index( std::string operator*() const { std::stringstream ss; - ss << tmp_dirname << "/sshash.tmp.run_" << skew_run_id - << ".kmer_requests." 
<< i << ".bin"; + ss << tmp_dirname << "/sshash.tmp.run_" << skew_run_id << ".kmer_requests." << i + << ".bin"; return ss.str(); } void operator++() { ++i; } @@ -606,8 +598,7 @@ void dictionary_builder::build_sparse_and_skew_index( mphf_build_config.verbose = false; util::configure_mphf_threads_and_partition(mphf_build_config, build_config.num_threads, build_config.ram_limit_in_GiB, - build_config.verbose, - "skew partition MPHF"); + build_config.verbose, "skew partition MPHF"); mphf_build_config.ram = (build_config.ram_limit_in_GiB * essentials::GiB) / 2; mphf_build_config.tmp_dir = build_config.tmp_dirname; From b3e49c90746b954892cf8719bbd51a4a44e483a4 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 6 May 2026 14:40:45 +0000 Subject: [PATCH 21/32] factor out buffered_record_stream; remove duplicated read loops The build's various ifstream-backed forward readers all followed the same pattern (open file, fixed-capacity record buffer, refill on exhaust, has_next/current/advance interface). Five separate copies of that loop existed: - file_merging_iterator::buffered_stream (per merge run) - disk_backed_offsets_builder::reader (per-thread offsets) - disk_backed_offsets_builder::full_iterator (encode/build) - streaming_minimizers_iterator (codewords MPHF input) - streaming_minimizer_bucket_reader (step 7.1 buckets) - skew_partition_kmer_iterator (skew MPHF input) Hoist the common primitive into a new `include/builder/buffered_record_stream.hpp`: template struct buffered_record_stream { void open(filename, buffer_records, start_byte = 0); void close(); bool empty() const; T const& current() const; void advance(); }; and reuse it from each of the five readers. The pthash-iterator ones (full_iterator, streaming_minimizers_iterator, skew_partition_kmer_iterator) wrap a shared_ptr to keep the copyable-by-value contract pthash expects. 
While here, define a packed `skew_kmer_record_t` for the phase B/C tmp file format and use it both at the writer and the buffered reader, so the on-disk record layout is in one place. Verified byte-identical SSHash output on salmonella_enterica m=7 and full --check on regular, --canonical, multi-thread (-t 4), and --weighted. https://claude.ai/code/session_01BShS2GDASvEsCAbgJyQVBK --- include/builder/buffered_record_stream.hpp | 106 ++++++++++++++++++ .../builder/disk_backed_offsets_builder.hpp | 92 ++++----------- include/builder/file_merging_iterator.hpp | 68 +---------- include/builder/util.hpp | 83 +++++--------- src/builder/build_sparse_and_skew_index.cpp | 81 ++++++------- 5 files changed, 202 insertions(+), 228 deletions(-) create mode 100644 include/builder/buffered_record_stream.hpp diff --git a/include/builder/buffered_record_stream.hpp b/include/builder/buffered_record_stream.hpp new file mode 100644 index 0000000..382505c --- /dev/null +++ b/include/builder/buffered_record_stream.hpp @@ -0,0 +1,106 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +namespace sshash { + +/* + A small buffered, forward-only reader of fixed-size records over a + binary file. Records are read in fixed-capacity chunks (~`buffer_records + * sizeof(T)` bytes of RAM) so the per-instance footprint is bounded + independently of the file size. + + Used as the underlying primitive by all of SSHash's builder readers + over on-disk record files (minimizer tuples, kmer requests, kmer + records, offset values, sorted-run records, etc.). The class is + move-only; for callers that need a copyable forward iterator (e.g. + pthash's `build_in_external_memory`, which takes an iterator by + value), wrap an instance in a `std::shared_ptr`. 
+ + Usage: + buffered_record_stream s; + s.open(filename); + while (!s.empty()) { + consume(s.current()); + s.advance(); + } + s.close(); +*/ +template +struct buffered_record_stream { + static constexpr uint64_t default_buffer_records = 4096; + + buffered_record_stream() = default; + buffered_record_stream(buffered_record_stream const&) = delete; + buffered_record_stream& operator=(buffered_record_stream const&) = delete; + buffered_record_stream(buffered_record_stream&&) = default; + buffered_record_stream& operator=(buffered_record_stream&&) = default; + + /* Open `filename` for forward reading; optionally seek to byte + `start_byte` before priming the read window. */ + void open(std::string const& filename, + uint64_t buffer_records = default_buffer_records, + std::streamoff start_byte = 0) { + m_buf.resize(std::max(1, buffer_records)); + m_in.open(filename, std::ifstream::binary); + if (!m_in.is_open()) { + throw std::runtime_error("cannot open file '" + filename + "'"); + } + if (start_byte != 0) m_in.seekg(start_byte, std::ios::beg); + m_pos = 0; + m_size = 0; + m_eof = false; + refill(); + } + + void close() { + if (m_in.is_open()) m_in.close(); + m_buf.clear(); + m_buf.shrink_to_fit(); + m_pos = 0; + m_size = 0; + m_eof = true; + } + + bool is_open() const { return m_in.is_open(); } + + /* True iff there are no more records in the stream. */ + bool empty() const { return m_pos >= m_size; } + + /* Reference to the current record. Valid until the next `advance()`. */ + T const& current() const { + assert(!empty()); + return m_buf[m_pos]; + } + + /* Move to the next record; refills the buffer from disk on demand. 
*/ + void advance() { + assert(!empty()); + ++m_pos; + if (m_pos >= m_size && !m_eof) refill(); + } + +private: + std::ifstream m_in; + std::vector m_buf; + uint64_t m_pos = 0; + uint64_t m_size = 0; + bool m_eof = true; + + void refill() { + m_pos = 0; + m_in.read(reinterpret_cast(m_buf.data()), + static_cast(m_buf.size() * sizeof(T))); + const std::streamsize got = m_in.gcount(); + m_size = static_cast(got) / sizeof(T); + if (m_size == 0) m_eof = true; + } +}; + +} // namespace sshash diff --git a/include/builder/disk_backed_offsets_builder.hpp b/include/builder/disk_backed_offsets_builder.hpp index 6be3562..ff1a24c 100644 --- a/include/builder/disk_backed_offsets_builder.hpp +++ b/include/builder/disk_backed_offsets_builder.hpp @@ -10,6 +10,7 @@ #include #include +#include "include/builder/buffered_record_stream.hpp" #include "include/offsets.hpp" namespace sshash { @@ -130,61 +131,34 @@ struct disk_backed_offsets_builder { Forward-sequential reader over the offsets file. Each thread in compute_minimizer_tuples should construct one for its assigned index range; per-thread RAM footprint is the buffer size only. + Built on top of `buffered_record_stream`. */ struct reader { reader() = default; - reader(reader const&) = delete; - reader& operator=(reader const&) = delete; - reader(reader&&) = default; - reader& operator=(reader&&) = default; /* Open the file and seek so that the next `next()` call returns `*(values + start_index)`. 
*/ void open(std::string const& filename, uint64_t start_index, uint64_t buffer_records = default_reader_buffer_records) { - m_buf.assign(std::max(1, buffer_records), 0); - m_pos = 0; - m_size = 0; - m_in.open(filename, std::ifstream::binary); - if (!m_in.is_open()) { - throw std::runtime_error("cannot open offsets tmp file '" + filename + "'"); - } - m_in.seekg(static_cast(start_index * sizeof(uint64_t)), std::ios::beg); - refill(); + m_stream.open(filename, buffer_records, + static_cast(start_index * sizeof(uint64_t))); } - void close() { - if (m_in.is_open()) m_in.close(); - m_buf.clear(); - m_buf.shrink_to_fit(); - m_pos = 0; - m_size = 0; - } + void close() { m_stream.close(); } /* Return the next offset and advance. Caller must ensure they don't read past the end of the file. */ uint64_t next() { - if (m_pos >= m_size) refill(); - assert(m_pos < m_size); - return m_buf[m_pos++]; - } - - private: - std::ifstream m_in; - std::vector m_buf; - uint64_t m_pos = 0; - uint64_t m_size = 0; - - void refill() { - m_pos = 0; - m_in.read(reinterpret_cast(m_buf.data()), - static_cast(m_buf.size() * sizeof(uint64_t))); - const std::streamsize got = m_in.gcount(); - m_size = static_cast(got) / sizeof(uint64_t); - if (m_size == 0) { + if (m_stream.empty()) { throw std::runtime_error("disk_backed_offsets_builder: read past end of file"); } + const uint64_t v = m_stream.current(); + m_stream.advance(); + return v; } + + private: + buffered_record_stream m_stream; }; /* Construct a reader positioned at `start_index`. Requires freeze(). */ @@ -202,9 +176,10 @@ struct disk_backed_offsets_builder { /* A copyable forward iterator over the entire offsets file, suitable for the `Iterator`-template `encode` / `build` calls in - `bits::endpoints_sequence` and `bits::compact_vector`. Holds the - underlying ifstream via shared_ptr so the iterator can be copied - (those APIs may copy the iterator internally). + `bits::endpoints_sequence` and `bits::compact_vector`. 
Wraps a + shared_ptr> so the iterator is + copyable; copies share the underlying stream state, which is what + those APIs expect. */ struct full_iterator { using iterator_category = std::forward_iterator_tag; @@ -217,43 +192,22 @@ struct disk_backed_offsets_builder { void open(std::string const& filename, uint64_t buffer_records = default_reader_buffer_records) { - m_state = std::make_shared(); - m_state->buf.assign(std::max(1, buffer_records), 0); - m_state->in.open(filename, std::ifstream::binary); - if (!m_state->in.is_open()) { - throw std::runtime_error("cannot open offsets tmp file '" + filename + "'"); - } - m_state->refill(); + m_stream = std::make_shared>(); + m_stream->open(filename, buffer_records); } uint64_t operator*() const { - assert(m_state && m_state->pos < m_state->size); - return m_state->buf[m_state->pos]; + assert(m_stream); + return m_stream->current(); } full_iterator& operator++() { - assert(m_state); - ++m_state->pos; - if (m_state->pos >= m_state->size && !m_state->eof) m_state->refill(); + assert(m_stream); + m_stream->advance(); return *this; } private: - struct state { - std::ifstream in; - std::vector buf; - uint64_t pos = 0; - uint64_t size = 0; - bool eof = false; - void refill() { - pos = 0; - in.read(reinterpret_cast(buf.data()), - static_cast(buf.size() * sizeof(uint64_t))); - const std::streamsize got = in.gcount(); - size = static_cast(got) / sizeof(uint64_t); - if (size == 0) eof = true; - } - }; - std::shared_ptr m_state; + std::shared_ptr> m_stream; }; /* diff --git a/include/builder/file_merging_iterator.hpp b/include/builder/file_merging_iterator.hpp index ff191ee..37cf24c 100644 --- a/include/builder/file_merging_iterator.hpp +++ b/include/builder/file_merging_iterator.hpp @@ -7,6 +7,7 @@ #include #include +#include "buffered_record_stream.hpp" #include "util.hpp" namespace sshash { @@ -79,70 +80,9 @@ struct file_merging_iterator // } private: - /* - A buffered, forward-only reader over a single run file. 
Reads in - chunks of `m_buf.size()` records via std::ifstream and presents a - T-by-reference current-value interface. - */ - struct buffered_stream { - buffered_stream() = default; - buffered_stream(buffered_stream const&) = delete; - buffered_stream& operator=(buffered_stream const&) = delete; - buffered_stream(buffered_stream&&) = default; - buffered_stream& operator=(buffered_stream&&) = default; - - void open(std::string const& filename, uint64_t buffer_records) { - m_buf.resize(std::max(1, buffer_records)); - m_in.open(filename, std::ifstream::binary); - if (!m_in.is_open()) { - throw std::runtime_error("cannot open run file '" + filename + "'"); - } - m_pos = 0; - m_size = 0; - m_eof = false; - refill(); - } - - void close() { - if (m_in.is_open()) m_in.close(); - m_buf.clear(); - m_buf.shrink_to_fit(); - m_pos = 0; - m_size = 0; - m_eof = true; - } - - bool empty() const { return m_pos >= m_size; } - - T const& current() const { - assert(!empty()); - return m_buf[m_pos]; - } - - void advance() { - assert(!empty()); - ++m_pos; - if (m_pos >= m_size && !m_eof) refill(); - } - - private: - std::ifstream m_in; - std::vector m_buf; - uint64_t m_pos = 0; - uint64_t m_size = 0; - bool m_eof = true; - - void refill() { - m_pos = 0; - m_in.read(reinterpret_cast(m_buf.data()), - static_cast(m_buf.size() * sizeof(T))); - const std::streamsize got = m_in.gcount(); - m_size = static_cast(got) / sizeof(T); - if (m_size == 0) m_eof = true; - } - }; - - std::vector m_streams; + /* Each input run is read via a small buffered ifstream. 
*/ + using stream_t = buffered_record_stream; + std::vector m_streams; std::vector m_tree; uint64_t m_begin = 0, m_size = 0; diff --git a/include/builder/util.hpp b/include/builder/util.hpp index bd57038..72ebd21 100644 --- a/include/builder/util.hpp +++ b/include/builder/util.hpp @@ -5,6 +5,7 @@ #include #include +#include "buffered_record_stream.hpp" #include "file_merging_iterator.hpp" #include "parallel_sort.hpp" @@ -159,12 +160,14 @@ struct minimizers_tuples_iterator { Streaming forward iterator over a sorted minimizers tmp file that yields each distinct `minimizer` value exactly once (i.e., one value per bucket). Equivalent to `minimizers_tuples_iterator` over an mmap'd - buffer, but reads from std::ifstream so RAM usage is constant. + buffer, but built on top of `buffered_record_stream` + so RAM usage is constant. Copyable: pthash's `build_in_external_memory` takes the iterator by - value, so the underlying ifstream is held via shared_ptr. Copies share - the stream state; pthash's local copy advances the shared stream, and - the original at the call site is unused after the build returns. + value, so the underlying buffered stream is held via shared_ptr. + Copies share the stream state; pthash's local copy advances the + shared stream, and the original at the call site is unused after the + build returns. */ struct streaming_minimizers_iterator { using iterator_category = std::forward_iterator_tag; @@ -176,25 +179,14 @@ struct streaming_minimizers_iterator { streaming_minimizers_iterator() = default; void open(std::string const& filename) { - m_in = std::make_shared(filename, std::ifstream::binary); - if (!m_in->is_open()) { - throw std::runtime_error("cannot open minimizers tmp file '" + filename + "'"); - } - m_eof = false; - m_current = uint64_t(-1); - // Bootstrap: read the first tuple. 
- minimizer_tuple t; - m_in->read(reinterpret_cast(&t), sizeof(minimizer_tuple)); - if (m_in->gcount() != static_cast(sizeof(minimizer_tuple))) { - m_eof = true; - return; - } - m_current = t.minimizer; + m_stream = std::make_shared>(); + m_stream->open(filename); + m_current = m_stream->empty() ? uint64_t(-1) : m_stream->current().minimizer; } void close() { - if (m_in && m_in->is_open()) m_in->close(); - m_in.reset(); + if (m_stream) m_stream->close(); + m_stream.reset(); } uint64_t operator*() const { return m_current; } @@ -204,21 +196,16 @@ struct streaming_minimizers_iterator { } private: - std::shared_ptr m_in; + std::shared_ptr> m_stream; uint64_t m_current = uint64_t(-1); - bool m_eof = true; void advance_to_next_minimizer() { const uint64_t prev = m_current; - minimizer_tuple t; - while (true) { - m_in->read(reinterpret_cast(&t), sizeof(minimizer_tuple)); - if (m_in->gcount() != static_cast(sizeof(minimizer_tuple))) { - m_eof = true; - return; // m_current holds last value; pthash has consumed `num_minimizers` keys - } - if (t.minimizer != prev) { - m_current = t.minimizer; + while (!m_stream->empty()) { + m_stream->advance(); + if (m_stream->empty()) return; // m_current holds last value + if (m_stream->current().minimizer != prev) { + m_current = m_stream->current().minimizer; return; } } @@ -236,43 +223,27 @@ struct streaming_minimizers_iterator { inputs this peaks at max_bucket_size * sizeof(minimizer_tuple). */ struct streaming_minimizer_bucket_reader { - void open(std::string const& filename) { - m_in.open(filename, std::ifstream::binary); - if (!m_in.is_open()) { - throw std::runtime_error("cannot open minimizers tmp file '" + filename + "'"); - } - // Read first record into the lookahead slot, if any. 
- m_in.read(reinterpret_cast(&m_lookahead), sizeof(minimizer_tuple)); - m_eof = (m_in.gcount() != static_cast(sizeof(minimizer_tuple))); - } + void open(std::string const& filename) { m_stream.open(filename); } - void close() { - if (m_in.is_open()) m_in.close(); - } + void close() { m_stream.close(); } - bool has_next_bucket() const { return !m_eof; } + bool has_next_bucket() const { return !m_stream.empty(); } /* Read the next bucket into `bucket_out` (cleared first). All tuples in a bucket share the same minimizer. Returns the bucket's minimizer. */ uint64_t next_bucket(std::vector& bucket_out) { bucket_out.clear(); - assert(!m_eof); - const uint64_t mm = m_lookahead.minimizer; + assert(has_next_bucket()); + const uint64_t mm = m_stream.current().minimizer; do { - bucket_out.push_back(m_lookahead); - m_in.read(reinterpret_cast(&m_lookahead), sizeof(minimizer_tuple)); - if (m_in.gcount() != static_cast(sizeof(minimizer_tuple))) { - m_eof = true; - break; - } - } while (m_lookahead.minimizer == mm); + bucket_out.push_back(m_stream.current()); + m_stream.advance(); + } while (!m_stream.empty() && m_stream.current().minimizer == mm); return mm; } private: - std::ifstream m_in; - minimizer_tuple m_lookahead; - bool m_eof = true; + buffered_record_stream m_stream; }; struct minimizers_tuples { diff --git a/src/builder/build_sparse_and_skew_index.cpp b/src/builder/build_sparse_and_skew_index.cpp index a861093..5bf5636 100644 --- a/src/builder/build_sparse_and_skew_index.cpp +++ b/src/builder/build_sparse_and_skew_index.cpp @@ -56,17 +56,30 @@ struct position_tuple { #pragma pack(pop) /* - Forward iterator over a per-skew-partition tmp file produced by step - 7.2 phase (B). Each record is `(kmer.bits, uint32_t pos_in_bucket)`. - This iterator yields successive Kmer values, exposing the minimal - interface (`*it`, `++it`) that pthash's external-memory partitioned PHF - builder consumes. 
+ Per-skew-partition tmp file record (written by step 7.2 phase (B), + consumed by phase (C)): a kmer's bit pattern + the pos_in_bucket + we'll later pack into the partition's positions compact_vector. +*/ +#pragma pack(push, 4) +template +struct skew_kmer_record_t { + using kmer_bits_t = decltype(Kmer{}.bits); + kmer_bits_t kmer_bits; + uint32_t pib; +}; +#pragma pack(pop) + +/* + Forward iterator over a per-skew-partition tmp file produced by phase + (B). Yields successive Kmer values via the minimal interface (`*it`, + `++it`) that pthash's external-memory partitioned PHF builder + consumes. pthash takes the iterator by value, so it must be copyable. The - underlying `ifstream` is held via `shared_ptr` and shared between - copies; pthash's copy advances the shared stream state, which is fine - because the original at the call site is no longer used after the - build call returns. + underlying buffered_record_stream is held via shared_ptr and shared + between copies; pthash's copy advances the shared stream state, which + is fine because the original at the call site is unused after the + build returns. 
*/ template struct skew_partition_kmer_iterator { @@ -79,36 +92,28 @@ struct skew_partition_kmer_iterator { skew_partition_kmer_iterator() = default; void open(std::string const& filename) { - m_in = std::make_shared(filename, std::ifstream::binary); - if (!m_in->is_open()) { - throw std::runtime_error("cannot open skew-partition tmp file '" + filename + "'"); - } - advance(); + m_stream = std::make_shared>>(); + m_stream->open(filename); + if (!m_stream->empty()) m_current.bits = m_stream->current().kmer_bits; } void close() { - if (m_in && m_in->is_open()) m_in->close(); - m_in.reset(); + if (m_stream) m_stream->close(); + m_stream.reset(); } Kmer const& operator*() const { return m_current; } skew_partition_kmer_iterator& operator++() { - advance(); + if (!m_stream->empty()) { + m_stream->advance(); + if (!m_stream->empty()) m_current.bits = m_stream->current().kmer_bits; + } return *this; } private: - std::shared_ptr m_in; + std::shared_ptr>> m_stream; Kmer m_current; - - void advance() { - decltype(Kmer{}.bits) bits; - m_in->read(reinterpret_cast(&bits), sizeof(bits)); - if (m_in->gcount() != static_cast(sizeof(bits))) return; - uint32_t pib; - m_in->read(reinterpret_cast(&pib), sizeof(pib)); // skip pos_in_bucket - m_current.bits = bits; - } }; template @@ -551,9 +556,8 @@ void dictionary_builder::build_sparse_and_skew_index( kmer = std::min(kmer, kmer_rc); } auto& w = partition_writers[req.partition_id]; - w.write(reinterpret_cast(&kmer.bits), sizeof(kmer.bits)); - w.write(reinterpret_cast(&req.pos_in_bucket), - sizeof(req.pos_in_bucket)); + skew_kmer_record_t rec{kmer.bits, req.pos_in_bucket}; + w.write(reinterpret_cast(&rec), sizeof(rec)); kmer_it.next(); } merger.next(); @@ -703,20 +707,19 @@ void dictionary_builder::build_sparse_and_skew_index( pos_buffer.clear(); }; - std::ifstream in(kmer_fn, std::ifstream::binary); - if (!in.is_open()) { - throw std::runtime_error("cannot open skew-partition tmp file"); - } + buffered_record_stream> rec_stream; + 
rec_stream.open(kmer_fn); for (uint64_t i = 0; i != n; ++i) { + assert(!rec_stream.empty()); + auto const& rec = rec_stream.current(); Kmer kmer; - in.read(reinterpret_cast(&kmer.bits), sizeof(kmer.bits)); - uint32_t pib; - in.read(reinterpret_cast(&pib), sizeof(pib)); + kmer.bits = rec.kmer_bits; const uint64_t pos = F(kmer); if (pos_buffer.size() == pos_buffer_capacity) flush_pos_buffer(); - pos_buffer.emplace_back(pos, pib); + pos_buffer.emplace_back(pos, rec.pib); + rec_stream.advance(); } - in.close(); + rec_stream.close(); std::remove(kmer_fn.c_str()); flush_pos_buffer(); } From a35c36458ee040f2a2c0caecd084e255cc5565a1 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 6 May 2026 14:41:01 +0000 Subject: [PATCH 22/32] build: add --no-streaming-save flag for in-RAM save path Currently the build CLI picks between two paths: -o without --check -> dictionary::build_streaming_save (spilled components are stitched into the output via the streaming saver; `dict` is not query-ready afterward) -o with --check (or no -o) -> dictionary::build (spilled components are materialized back into `dict`, then optionally essentials::save) For users with plenty of RAM who don't want the streaming-save tmp-file concatenation (and don't need --check), expose the in-RAM save path explicitly via --no-streaming-save. When set, the build does build() + essentials::save: peak RSS at save time briefly equals the in-RAM index size, but the save is a single pass over `dict` rather than a stitched concatenation. Useful when the user already pays the memory cost (e.g., to query the dict immediately afterward in another tool, or just prefers the simpler save path). Both flows produce byte-identical output files; the flag only affects the save path. 
https://claude.ai/code/session_01BShS2GDASvEsCAbgJyQVBK --- tools/build.cpp | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/tools/build.cpp b/tools/build.cpp index fbc7d1d..1a530c0 100644 --- a/tools/build.cpp +++ b/tools/build.cpp @@ -46,6 +46,13 @@ int build(int argc, char** argv) { true); parser.add("check", "Check correctness after construction.", "--check", false, true); parser.add("verbose", "Verbose output during construction.", "--verbose", false, true); + parser.add("no_streaming_save", + "Force the in-RAM save path even with -o: build, materialize the dictionary in RAM, " + "then write it via essentials::save. Peak RSS at save time briefly equals the " + "in-RAM index size; useful when the user has plenty of memory and wants a single " + "save call rather than the streaming-save tmp-file concatenation. Implied by " + "--check (which always materializes for query).", + "--no-streaming-save", false, true); if (!parser.parse()) return 0; @@ -74,19 +81,28 @@ int build(int argc, char** argv) { // build_config.print(); bool check = parser.get("check"); + bool no_streaming_save = parser.get("no_streaming_save"); bool has_output = parser.parsed("output_filename"); dictionary_type dict; - if (has_output && !check) { + if (has_output && !check && !no_streaming_save) { /* Streaming-save path: keeps peak RAM bounded by the build phase - (the strings bit-vector is never fully in RAM). After this returns - `dict` is not query-ready; reload from disk to query. */ + (the strings bit-vector and the spilled compact_vectors / MPHFs + are never fully in RAM). After this returns `dict` is not + query-ready; reload from disk to query. */ auto output_filename = parser.get("output_filename"); essentials::logger("building data structure (streaming save)..."); dict.build_streaming_save(input_filename, build_config, output_filename); essentials::logger("DONE"); } else { + /* In-RAM save path. 
The build still spills internally for + bounded-RAM construction, but at the end every spilled + component is materialized back into `dict` so it's + query-ready. Used whenever --check is requested (queries need + `dict` populated) or when the user explicitly opts in via + --no-streaming-save. Peak RSS briefly hits the full index + size at save time. */ essentials::logger("building data structure..."); dict.build(input_filename, build_config); From d7dc21dcc4655dcdf3b81cd50dbb967140ba6b3e Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 6 May 2026 16:07:51 +0000 Subject: [PATCH 23/32] Revert "build: add --no-streaming-save flag for in-RAM save path" This reverts commit a35c36458ee040f2a2c0caecd084e255cc5565a1. --- tools/build.cpp | 22 +++------------------- 1 file changed, 3 insertions(+), 19 deletions(-) diff --git a/tools/build.cpp b/tools/build.cpp index 1a530c0..fbc7d1d 100644 --- a/tools/build.cpp +++ b/tools/build.cpp @@ -46,13 +46,6 @@ int build(int argc, char** argv) { true); parser.add("check", "Check correctness after construction.", "--check", false, true); parser.add("verbose", "Verbose output during construction.", "--verbose", false, true); - parser.add("no_streaming_save", - "Force the in-RAM save path even with -o: build, materialize the dictionary in RAM, " - "then write it via essentials::save. Peak RSS at save time briefly equals the " - "in-RAM index size; useful when the user has plenty of memory and wants a single " - "save call rather than the streaming-save tmp-file concatenation. 
Implied by " - "--check (which always materializes for query).", - "--no-streaming-save", false, true); if (!parser.parse()) return 0; @@ -81,28 +74,19 @@ int build(int argc, char** argv) { // build_config.print(); bool check = parser.get("check"); - bool no_streaming_save = parser.get("no_streaming_save"); bool has_output = parser.parsed("output_filename"); dictionary_type dict; - if (has_output && !check && !no_streaming_save) { + if (has_output && !check) { /* Streaming-save path: keeps peak RAM bounded by the build phase - (the strings bit-vector and the spilled compact_vectors / MPHFs - are never fully in RAM). After this returns `dict` is not - query-ready; reload from disk to query. */ + (the strings bit-vector is never fully in RAM). After this returns + `dict` is not query-ready; reload from disk to query. */ auto output_filename = parser.get("output_filename"); essentials::logger("building data structure (streaming save)..."); dict.build_streaming_save(input_filename, build_config, output_filename); essentials::logger("DONE"); } else { - /* In-RAM save path. The build still spills internally for - bounded-RAM construction, but at the end every spilled - component is materialized back into `dict` so it's - query-ready. Used whenever --check is requested (queries need - `dict` populated) or when the user explicitly opts in via - --no-streaming-save. Peak RSS briefly hits the full index - size at save time. 
*/ essentials::logger("building data structure..."); dict.build(input_filename, build_config); From d8328511d3a59ecae62276986fd945aa8a9f7769 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 6 May 2026 16:38:37 +0000 Subject: [PATCH 24/32] finalize_stats: print total bits/kmer also in streaming-save flow MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously the streaming-save flow skipped print_space_breakdown (d.m_spss.strings is an empty placeholder there, so calling breakdown would just print zeros), which meant the user got no size summary at the end. Master always prints "total ... bits/kmer" though. Stat the saved file's size when `d` isn't materialized and print the total directly. The in-RAM flow keeps the existing per-component breakdown unchanged. Cost is one fstat at the very end of the build. Also fixes index_size_in_bytes in the JSON-line build_stats output for the streaming-save flow (it used to report just the in-RAM-resident bytes ≈ a few hundred MB instead of the actual on-disk index size). https://claude.ai/code/session_01BShS2GDASvEsCAbgJyQVBK --- include/builder/dictionary_builder.hpp | 32 +++++++++++++++++++++----- 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/include/builder/dictionary_builder.hpp b/include/builder/dictionary_builder.hpp index 4cf8d0c..4475bb6 100644 --- a/include/builder/dictionary_builder.hpp +++ b/include/builder/dictionary_builder.hpp @@ -146,7 +146,7 @@ struct dictionary_builder // strings_builder.remove_file(); spilled.clear_files(); }); - finalize_stats(d); + finalize_stats(d, output_filename); } build_configuration build_config; @@ -272,16 +272,36 @@ struct dictionary_builder // }); } - void finalize_stats(dictionary& d) { + void finalize_stats(dictionary& d, std::string const& saved_path = "") { + /* For the materialize-to-RAM flow `d` is fully populated and we + can call `d.print_space_breakdown()` / `d.num_bits()` directly. 
+ For the streaming-save flow `d`'s spilled components are empty + placeholders, so we read the on-disk index file's size for the + total — this is just a stat, no recomputation. */ + const bool d_is_populated = d.m_spss.strings.num_bits() > 0; + uint64_t num_bytes = 0; + if (d_is_populated) { + num_bytes = (d.num_bits() + 7) / 8; + } else if (!saved_path.empty()) { + std::ifstream f(saved_path, std::ios::binary | std::ios::ate); + if (f.is_open()) num_bytes = static_cast(f.tellg()); + } + if (build_config.verbose) { print_time(total_time_musec, "total time"); - /* `print_space_breakdown` reads d.m_spss.strings; only safe in - the materialize-to-RAM flow. */ - if (d.m_spss.strings.num_bits() > 0) d.print_space_breakdown(); + if (d_is_populated) { + d.print_space_breakdown(); + } else if (num_bytes > 0) { + std::cout << "total index size: " << num_bytes << " [B] -- " + << essentials::convert(num_bytes, essentials::MB) << " [MB]\n"; + std::cout << " total: " + << (num_kmers > 0 ? (8.0 * num_bytes) / num_kmers : 0.0) + << " [bits/kmer]" << std::endl; + } } build_stats.add("total_build_time_in_microsec", total_time_musec); - build_stats.add("index_size_in_bytes", (d.num_bits() + 7) / 8); + build_stats.add("index_size_in_bytes", num_bytes); build_stats.add("num_kmers", d.num_kmers()); if (build_config.verbose) build_stats.print(); From 030f1d0774added90c5ce5fe96087cbc59c3851a Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 6 May 2026 16:49:55 +0000 Subject: [PATCH 25/32] build_stats: format step timings as seconds with [sec] unit All build-step durations in the JSON-line build_stats output were raw microseconds, which was hard to read. Convert them to "X.XXX [sec]" via a small `musec_as_seconds_str` helper. Steps 7.1 and 7.2 are added directly inside build_sparse_and_skew_index.cpp, so apply the same helper there too. Also switch finalize_stats to std::filesystem::file_size for the saved index size, instead of an fstream + tellg. 
--- include/builder/dictionary_builder.hpp | 23 ++++++++++++++++----- src/builder/build_sparse_and_skew_index.cpp | 6 ++++-- 2 files changed, 22 insertions(+), 7 deletions(-) diff --git a/include/builder/dictionary_builder.hpp b/include/builder/dictionary_builder.hpp index 4475bb6..49e8589 100644 --- a/include/builder/dictionary_builder.hpp +++ b/include/builder/dictionary_builder.hpp @@ -1,5 +1,7 @@ #pragma once +#include +#include #include #include "essentials.hpp" @@ -277,14 +279,16 @@ struct dictionary_builder // can call `d.print_space_breakdown()` / `d.num_bits()` directly. For the streaming-save flow `d`'s spilled components are empty placeholders, so we read the on-disk index file's size for the - total — this is just a stat, no recomputation. */ + total via `std::filesystem::file_size` — direct OS stat, no + recomputation. */ const bool d_is_populated = d.m_spss.strings.num_bits() > 0; uint64_t num_bytes = 0; if (d_is_populated) { num_bytes = (d.num_bits() + 7) / 8; } else if (!saved_path.empty()) { - std::ifstream f(saved_path, std::ios::binary | std::ios::ate); - if (f.is_open()) num_bytes = static_cast(f.tellg()); + std::error_code ec; + const auto sz = std::filesystem::file_size(saved_path, ec); + if (!ec) num_bytes = static_cast(sz); } if (build_config.verbose) { @@ -300,7 +304,7 @@ struct dictionary_builder // } } - build_stats.add("total_build_time_in_microsec", total_time_musec); + build_stats.add("total_build_time", musec_as_seconds_str(total_time_musec).c_str()); build_stats.add("index_size_in_bytes", num_bytes); build_stats.add("num_kmers", d.num_kmers()); @@ -312,6 +316,15 @@ struct dictionary_builder // << (time_in_musec * 1000) / num_kmers << " [ns/kmer])" << std::endl; } + /* Format a microsecond count as e.g. "7.292 [sec]" for the JSON-line + build_stats output. Three decimals = millisecond precision, which is + both compact and plenty precise for build-step durations. 
*/ + static std::string musec_as_seconds_str(uint64_t musec) { + char buf[64]; + std::snprintf(buf, sizeof(buf), "%.3f [sec]", static_cast(musec) / 1.0e6); + return std::string(buf); + } + template void do_step(std::string const& step, Callback const& f) { timer.start(); @@ -320,7 +333,7 @@ struct dictionary_builder // uint64_t step_elapsed_time_musec = timer.elapsed(); total_time_musec += step_elapsed_time_musec; if (build_config.verbose) print_time(step_elapsed_time_musec, step); - build_stats.add(step, step_elapsed_time_musec); + build_stats.add(step, musec_as_seconds_str(step_elapsed_time_musec).c_str()); timer.reset(); } diff --git a/src/builder/build_sparse_and_skew_index.cpp b/src/builder/build_sparse_and_skew_index.cpp index 5bf5636..e7c1a17 100644 --- a/src/builder/build_sparse_and_skew_index.cpp +++ b/src/builder/build_sparse_and_skew_index.cpp @@ -474,7 +474,8 @@ void dictionary_builder::build_sparse_and_skew_index( } timer.stop(); - build_stats.add("step 7.1 (build sparse index)", uint64_t(timer.elapsed())); + build_stats.add("step 7.1 (build sparse index)", + musec_as_seconds_str(uint64_t(timer.elapsed())).c_str()); if (build_config.verbose) { print_time(uint64_t(timer.elapsed()), "step 7.1 (build sparse index)"); } @@ -791,7 +792,8 @@ void dictionary_builder::build_sparse_and_skew_index( timer.stop(); - build_stats.add("step 7.2 (build skew index)", uint64_t(timer.elapsed())); + build_stats.add("step 7.2 (build skew index)", + musec_as_seconds_str(uint64_t(timer.elapsed())).c_str()); if (build_config.verbose) { print_time(uint64_t(timer.elapsed()), "step 7.2 (build skew index)"); From 659b9561010bc72145b6274bcc58cdba63276171 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 6 May 2026 16:57:57 +0000 Subject: [PATCH 26/32] remove dead bucket_type / minimizers_tuples_iterator These two structs in include/builder/util.hpp were the in-RAM iterator types that walked an mmap'd minimizer-tuples buffer. 
The build pipeline now reads minimizers via streaming_minimizers_iterator and streaming_minimizer_bucket_reader (both built on buffered_record_stream), and nothing else referenced bucket_type or minimizers_tuples_iterator. Also drop the now-stale comment references to minimizers_tuples_iterator in the surviving streaming iterators' docstrings. --- include/builder/dictionary_builder.hpp | 3 +- include/builder/util.hpp | 103 +------------------------ 2 files changed, 4 insertions(+), 102 deletions(-) diff --git a/include/builder/dictionary_builder.hpp b/include/builder/dictionary_builder.hpp index 49e8589..e8cf02c 100644 --- a/include/builder/dictionary_builder.hpp +++ b/include/builder/dictionary_builder.hpp @@ -345,8 +345,7 @@ struct dictionary_builder // void build_mphf(dictionary& d) { const uint64_t num_minimizers = minimizers.num_minimizers(); /* Stream minimizers from disk via std::ifstream (no mmap); the - iterator yields each distinct minimizer once, matching what - `minimizers_tuples_iterator` did over the mmap'd file. */ + iterator yields each distinct minimizer once. 
*/ streaming_minimizers_iterator iterator; iterator.open(minimizers.get_minimizers_filename()); d.m_ssi.codewords.build(iterator, num_minimizers, build_config); diff --git a/include/builder/util.hpp b/include/builder/util.hpp index 72ebd21..94761b9 100644 --- a/include/builder/util.hpp +++ b/include/builder/util.hpp @@ -61,106 +61,10 @@ inline std::ostream& operator<<(std::ostream& os, minimizer_tuple const& mt) { return os; } -struct bucket_type { - bucket_type(minimizer_tuple const* begin, minimizer_tuple const* end) - : m_begin(begin), m_end(end) {} - - struct iterator { - iterator(minimizer_tuple const* begin) : m_begin(begin) {} - - inline minimizer_tuple operator*() const { return *m_begin; } - inline void operator++() { ++m_begin; } - bool operator==(iterator const& other) const { return m_begin == other.m_begin; } - bool operator!=(iterator const& other) const { return !(*this == other); } - - private: - minimizer_tuple const* m_begin; - }; - - iterator begin() const { return iterator(m_begin); } - iterator end() const { return iterator(m_end); } - - /* - When a canonical index is built (option `--canonical`), - a minimizer offset can correspond to more than one super-kmer. - A super-kmer is uniquely identified by the couple - (minimizer offset, position of minimizer in the first kmer of the super-kmer). - These two components, together, give the - starting position of a super-kmer in the sequence. - - So the method size() returns the number of minimizer - positions which is <= the number of superkmers. 
- */ - - uint64_t num_super_kmers() const { return std::distance(m_begin, m_end); } - - uint64_t size() const { - uint64_t num_minimizer_positions = 0; - uint64_t prev_pos_in_seq = constants::invalid_uint64; - auto const* begin = m_begin; - while (begin != m_end) { - uint64_t pos_in_seq = (*begin).pos_in_seq; - if (pos_in_seq != prev_pos_in_seq) { - ++num_minimizer_positions; - prev_pos_in_seq = pos_in_seq; - } - ++begin; - } - assert(num_minimizer_positions <= num_super_kmers()); - return num_minimizer_positions; - } - - minimizer_tuple const* begin_ptr() const { return m_begin; } - minimizer_tuple const* end_ptr() const { return m_end; } - -private: - minimizer_tuple const* m_begin; - minimizer_tuple const* m_end; -}; - -/* - Iterate over the "bucket" of a minimizer, i.e., - the sorted list of minimizer tuples - (minimizer, pos_in_seq, pos_in_kmer, num_kmers_in_superkmer). -*/ -struct minimizers_tuples_iterator { - typedef minimizer_tuple value_type; - using iterator_category = std::forward_iterator_tag; - - minimizers_tuples_iterator(minimizer_tuple const* begin, minimizer_tuple const* end) - : m_bucket_begin(begin), m_bucket_end(begin), m_end(end) { - m_bucket_end = next_begin(); - } - - inline uint64_t minimizer() const { return (*m_bucket_begin).minimizer; } - inline uint64_t operator*() const { return minimizer(); } - inline void next() { - m_bucket_begin = m_bucket_end; - m_bucket_end = next_begin(); - } - inline void operator++() { next(); } - bool has_next() const { return m_bucket_begin != m_end; } - bucket_type bucket() const { return bucket_type(m_bucket_begin, m_bucket_end); } - -private: - minimizer_tuple const* m_bucket_begin; - minimizer_tuple const* m_bucket_end; - minimizer_tuple const* m_end; - - minimizer_tuple const* next_begin() { - if (m_bucket_begin == m_end) return m_end; - minimizer_tuple const* begin = m_bucket_begin; - uint64_t prev_minimizer = begin->minimizer; - while (++begin != m_end and begin->minimizer == prev_minimizer) {} - 
return begin; - } -}; - /* Streaming forward iterator over a sorted minimizers tmp file that yields each distinct `minimizer` value exactly once (i.e., one value - per bucket). Equivalent to `minimizers_tuples_iterator` over an mmap'd - buffer, but built on top of `buffered_record_stream` + per bucket), built on top of `buffered_record_stream` so RAM usage is constant. Copyable: pthash's `build_in_external_memory` takes the iterator by @@ -215,9 +119,8 @@ struct streaming_minimizers_iterator { /* Streaming reader over a minimizers tmp file. Reads minimizer_tuple records via std::ifstream (no mmap), and groups consecutive tuples by - minimizer into "buckets" — exactly as `minimizers_tuples_iterator` does - over an mmap'd buffer, but with bounded RAM (~ one bucket at a time - plus one record of lookahead). + minimizer into "buckets" with bounded RAM (~ one bucket at a time plus + one record of lookahead). The caller passes a vector to receive the bucket's tuples; for typical inputs this peaks at max_bucket_size * sizeof(minimizer_tuple). From 1d765419af20bca71547bb8f0a77d12133555a12 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Wed, 6 May 2026 19:02:16 +0200 Subject: [PATCH 27/32] minor --- include/constants.hpp | 4 ---- include/offsets.hpp | 6 ++++-- include/util.hpp | 27 ++++++++++++--------------- src/builder/build.cpp | 8 +++----- 4 files changed, 19 insertions(+), 26 deletions(-) diff --git a/include/constants.hpp b/include/constants.hpp index a020a6a..ec779b6 100644 --- a/include/constants.hpp +++ b/include/constants.hpp @@ -4,10 +4,6 @@ namespace sshash::constants { constexpr uint64_t invalid_uint64 = uint64_t(-1); constexpr uint64_t default_ram_limit_in_GiB = 8; -/* Floor on --ram-limit. Below this the build's streaming buffers + pthash's - internal working memory can't usefully be made to fit; rather than degrade - further at very tight budgets, we clamp `-g` to at least this value - (modest by today's desktop standards). 
*/ constexpr uint64_t min_ram_limit_in_GiB = 4; constexpr uint64_t seed = 1; diff --git a/include/offsets.hpp b/include/offsets.hpp index b592e7b..6307471 100644 --- a/include/offsets.hpp +++ b/include/offsets.hpp @@ -5,7 +5,8 @@ namespace sshash { -template struct disk_backed_offsets_builder; +template +struct disk_backed_offsets_builder; struct num_bits { num_bits() : per_absolute_offset(0), per_relative_offset(0), per_string_id(0) {} @@ -106,7 +107,8 @@ struct offsets // /* Allow disk_backed_offsets_builder to populate m_seq directly via a streaming forward iterator (mirroring what `Seq`'s nested builder does, but with on-disk values). */ - template friend struct disk_backed_offsets_builder; + template + friend struct disk_backed_offsets_builder; protected: Seq m_seq; diff --git a/include/util.hpp b/include/util.hpp index e45574f..c29e748 100644 --- a/include/util.hpp +++ b/include/util.hpp @@ -221,20 +221,18 @@ static inline uint64_t get_seed_for_hash_function(build_configuration const& bui to capping threads — in that case we emit a warning naming the MPHF so the user knows the requested -t couldn't be honored. 
*/ -static inline void configure_mphf_threads_and_partition( - pthash::build_configuration& mphf, // - uint64_t requested_num_threads, // - uint64_t ram_limit_in_GiB, // - bool verbose, // - char const* mphf_name) // +static inline void configure_mphf_threads_and_partition(pthash::build_configuration& mphf, // + uint64_t requested_num_threads, // + uint64_t ram_limit_in_GiB, // + bool verbose, // + char const* mphf_name) // { - constexpr uint64_t per_key_bytes = 32; // pairs_t entry + sort slack + constexpr uint64_t per_key_bytes = 32; // pairs_t entry + sort slack constexpr uint64_t min_avg_partition_size = uint64_t(100) * 1000; const uint64_t default_avg = constants::avg_partition_size; const uint64_t pthash_ram = (ram_limit_in_GiB * essentials::GiB) / 2; - const uint64_t per_thread = - pthash_ram / std::max(1, requested_num_threads); + const uint64_t per_thread = pthash_ram / std::max(1, requested_num_threads); const uint64_t avg_for_thread_budget = per_thread / per_key_bytes; if (avg_for_thread_budget >= default_avg) { @@ -249,14 +247,13 @@ static inline void configure_mphf_threads_and_partition( } else { /* Pathological: not enough RAM per thread even at the floor. Cap threads so the floor fits. 
*/ - const uint64_t max_threads = std::max( - 1, pthash_ram / (per_key_bytes * min_avg_partition_size)); + const uint64_t max_threads = + std::max(1, pthash_ram / (per_key_bytes * min_avg_partition_size)); if (verbose) { std::cout << " --> WARNING: not enough RAM per thread for " << mphf_name - << " (--ram-limit=" << ram_limit_in_GiB << " GiB, " - << requested_num_threads << " requested threads): capping to " - << max_threads << " threads at min partition size " - << min_avg_partition_size << std::endl; + << " (--ram-limit=" << ram_limit_in_GiB << " GiB, " << requested_num_threads + << " requested threads): capping to " << max_threads + << " threads at min partition size " << min_avg_partition_size << std::endl; } mphf.num_threads = max_threads; mphf.avg_partition_size = min_avg_partition_size; diff --git a/src/builder/build.cpp b/src/builder/build.cpp index 0f6210b..812a340 100644 --- a/src/builder/build.cpp +++ b/src/builder/build.cpp @@ -9,12 +9,14 @@ namespace sshash { namespace { inline void validate_and_normalize_build_config(build_configuration& bc, uint64_t max_k, - uint64_t max_m) { + uint64_t max_m) // +{ if (bc.k == 0) throw std::runtime_error("k must be > 0"); if (bc.k > max_k) { throw std::runtime_error("k must be less <= " + std::to_string(max_k) + " but got k = " + std::to_string(bc.k)); } + if (bc.m == 0) throw std::runtime_error("m must be > 0"); if (bc.m > max_m) { throw std::runtime_error("m must be less <= " + std::to_string(max_m) + @@ -22,10 +24,6 @@ inline void validate_and_normalize_build_config(build_configuration& bc, uint64_ } if (bc.m > bc.k) throw std::runtime_error("m must be <= k"); - /* Clamp --ram-limit to the floor. Below this, the streaming buffers - plus pthash's internal working memory can't usefully be made to - fit; rather than try to squeeze further we treat the floor as the - effective budget. 
*/ if (bc.ram_limit_in_GiB < constants::min_ram_limit_in_GiB) { if (bc.verbose) { std::cout << " --> NOTE: --ram-limit raised from " << bc.ram_limit_in_GiB From 033d59f85b43b6e5165b30d547c5a3af6e355c3d Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 6 May 2026 17:18:49 +0000 Subject: [PATCH 28/32] always build via streaming-save; mmap the saved file for --check The in-RAM build path now duplicated work: it materialized all spilled components back into RAM after step 7 just so --check could query the in-memory dict. Drop it. Make streaming-save the only build path: * dictionary::build now takes output_filename; the streaming variant is gone and there is no longer a query-ready in-memory build. * dictionary_builder::build_streaming_save -> build, and the materialize_spilled_into / materialize_compact_vector_from_file helpers are removed (~70 LOC). * finalize_stats no longer branches on whether `d` is populated. * tools/build.cpp always streams to a file. If the user passed -o, that path is used; otherwise a tmp file under tmp_dirname is written and removed on exit. --check loads the saved file via open_dictionary with mmap=true and runs the existing correctness checks against that. --- include/builder/dictionary_builder.hpp | 119 ++++--------------------- include/dictionary.hpp | 19 ++-- src/builder/build.cpp | 18 +--- tools/build.cpp | 55 ++++++------ 4 files changed, 59 insertions(+), 152 deletions(-) diff --git a/include/builder/dictionary_builder.hpp b/include/builder/dictionary_builder.hpp index e8cf02c..0439d7c 100644 --- a/include/builder/dictionary_builder.hpp +++ b/include/builder/dictionary_builder.hpp @@ -17,21 +17,9 @@ namespace sshash { /* - Helper: load a serialized bits::compact_vector back from a tmp file - into the given in-RAM compact_vector. Used by the materializing build - flow (after step 7) so that --check / queries can run. 
-*/ -inline void materialize_compact_vector_from_file(bits::compact_vector& cv, - std::string const& filename) { - essentials::loader loader(filename.c_str()); - loader.visit(cv); -} - -/* - Tmp file paths for the compact_vectors that step 7 spills to disk. - Populated by build_sparse_and_skew_index; consumed by step 8 (either - materialized back into RAM for `build()`, or injected into the output - by `build_streaming_save()`). + Tmp file paths for the compact_vectors and MPHFs that step 7 spills + to disk. Populated by build_sparse_and_skew_index and injected into + the output by step 8 (stream-save). */ struct spilled_components { std::string control_codewords_path; @@ -72,32 +60,14 @@ struct dictionary_builder // } /* - Build a query-ready dictionary in `d`. After this returns, all - spilled components and `d.m_spss.strings` are materialized in RAM - (peak briefly equals the index size). Use this when the caller - needs to query `d` post-build (e.g., `--check`). - */ - void build(dictionary& d, std::string const& filename) { - run_steps_1_through_7(d, filename); - do_step("step 8 (materialize spilled components to RAM)", [&]() { - materialize_spilled_into(d); - strings_builder.load_into(d.m_spss.strings); - strings_builder.remove_file(); - spilled.clear_files(); - }); - finalize_stats(d); - } - - /* - Build the dictionary and stream-save it to `output_filename` without - ever materializing the spilled components or `strings` in RAM. - After this returns, `d` is *not* query-ready. Use this when the - caller only needs the on-disk index file and wants to keep peak RAM - bounded by the build phase. + Build the dictionary and stream-save it to `output_filename` + without ever materializing the spilled components or `strings` + in RAM. After this returns, `d` is *not* query-ready; reload the + saved file via `essentials::load` / `essentials::mmap` to query. 
*/ - void build_streaming_save(dictionary& d, // - std::string const& filename, // - std::string const& output_filename) // + void build(dictionary& d, // + std::string const& filename, // + std::string const& output_filename) // { run_steps_1_through_7(d, filename); do_step("step 8 (stream-save dictionary to disk)", [&]() { @@ -166,49 +136,6 @@ struct dictionary_builder // uint64_t total_time_musec; private: - /* Load each spilled compact_vector tmp file back into the corresponding - in-RAM compact_vector inside `d`. Used by the materializing build - flow so queries can run against `d` (e.g., during --check). */ - void materialize_spilled_into(dictionary& d) { - if (!spilled.control_codewords_path.empty()) { - materialize_compact_vector_from_file(d.m_ssi.codewords.control_codewords, - spilled.control_codewords_path); - } - if (!spilled.mid_load_buckets_path.empty()) { - materialize_compact_vector_from_file(d.m_ssi.mid_load_buckets, - spilled.mid_load_buckets_path); - } - if (!spilled.heavy_load_buckets_path.empty()) { - materialize_compact_vector_from_file(d.m_ssi.ski.heavy_load_buckets, - spilled.heavy_load_buckets_path); - } - /* Reload the spilled MPHFs back into RAM so queries work. 
*/ - if (!spilled.codewords_mphf_path.empty()) { - essentials::loader loader(spilled.codewords_mphf_path.c_str()); - loader.visit(d.m_ssi.codewords.mphf); - } - const std::size_t num_part = - std::max(spilled.skew_positions_paths.size(), spilled.skew_mphfs_paths.size()); - if (num_part > 0) { - std::vector positions_vec(num_part); - std::vector> mphfs_vec(num_part); - for (std::size_t i = 0; i != spilled.skew_positions_paths.size(); ++i) { - if (!spilled.skew_positions_paths[i].empty()) { - materialize_compact_vector_from_file(positions_vec[i], - spilled.skew_positions_paths[i]); - } - } - for (std::size_t i = 0; i != spilled.skew_mphfs_paths.size(); ++i) { - if (!spilled.skew_mphfs_paths[i].empty()) { - essentials::loader loader(spilled.skew_mphfs_paths[i].c_str()); - loader.visit(mphfs_vec[i]); - } - } - d.m_ssi.ski.positions = std::move(positions_vec); - d.m_ssi.ski.mphfs = std::move(mphfs_vec); - } - } - void run_steps_1_through_7(dictionary& d, std::string const& filename) { d.m_k = build_config.k; d.m_m = build_config.m; @@ -274,28 +201,18 @@ struct dictionary_builder // }); } - void finalize_stats(dictionary& d, std::string const& saved_path = "") { - /* For the materialize-to-RAM flow `d` is fully populated and we - can call `d.print_space_breakdown()` / `d.num_bits()` directly. - For the streaming-save flow `d`'s spilled components are empty - placeholders, so we read the on-disk index file's size for the - total via `std::filesystem::file_size` — direct OS stat, no - recomputation. */ - const bool d_is_populated = d.m_spss.strings.num_bits() > 0; + void finalize_stats(dictionary& d, std::string const& saved_path) { + /* `d`'s spilled components are empty placeholders post stream-save, + so read the on-disk index file's size via std::filesystem::file_size + rather than recomputing from `d`. 
*/ uint64_t num_bytes = 0; - if (d_is_populated) { - num_bytes = (d.num_bits() + 7) / 8; - } else if (!saved_path.empty()) { - std::error_code ec; - const auto sz = std::filesystem::file_size(saved_path, ec); - if (!ec) num_bytes = static_cast(sz); - } + std::error_code ec; + const auto sz = std::filesystem::file_size(saved_path, ec); + if (!ec) num_bytes = static_cast(sz); if (build_config.verbose) { print_time(total_time_musec, "total time"); - if (d_is_populated) { - d.print_space_breakdown(); - } else if (num_bytes > 0) { + if (num_bytes > 0) { std::cout << "total index size: " << num_bytes << " [B] -- " << essentials::convert(num_bytes, essentials::MB) << " [MB]\n"; std::cout << " total: " diff --git a/include/dictionary.hpp b/include/dictionary.hpp index 7790efb..e553b6e 100644 --- a/include/dictionary.hpp +++ b/include/dictionary.hpp @@ -25,19 +25,16 @@ struct dictionary // , m_m(0) , m_canonical(false) {} - /* Build from input file. After this returns, `*this` is query-ready. */ - void build(std::string const& input_filename, build_configuration const& build_config); - /* - Build from input file and stream-save the resulting dictionary to - `output_filename`. The strings bit-vector is never materialized in - RAM during construction, so peak RAM is bounded by the build phase - only. After this returns, `*this` is *not* query-ready - (`m_spss.strings` is empty); reload via `essentials::load` to query. + Build from input file, streaming the resulting dictionary to + `output_filename` as it goes. The strings bit-vector and the + sparse/skew components are never fully materialized in RAM during + construction, so peak RAM is bounded by the build phase only. + After this returns, `*this` is *not* query-ready; load the saved + index back via `essentials::load` / `essentials::mmap` to query. 
*/ - void build_streaming_save(std::string const& input_filename, - build_configuration const& build_config, - std::string const& output_filename); + void build(std::string const& input_filename, build_configuration const& build_config, + std::string const& output_filename); essentials::version_number vnum() const { return m_vnum; } uint64_t num_kmers() const { return m_num_kmers; } diff --git a/src/builder/build.cpp b/src/builder/build.cpp index 812a340..e4ee59a 100644 --- a/src/builder/build.cpp +++ b/src/builder/build.cpp @@ -37,24 +37,14 @@ inline void validate_and_normalize_build_config(build_configuration& bc, uint64_ } // namespace template -void dictionary::build(std::string const& filename, - build_configuration const& build_config) // +void dictionary::build(std::string const& input_filename, + build_configuration const& build_config, + std::string const& output_filename) // { build_configuration bc = build_config; validate_and_normalize_build_config(bc, Kmer::max_k, Kmer::max_m); dictionary_builder builder(bc); - builder.build(*this, filename); -} - -template -void dictionary::build_streaming_save(std::string const& input_filename, - build_configuration const& build_config, - std::string const& output_filename) // -{ - build_configuration bc = build_config; - validate_and_normalize_build_config(bc, Kmer::max_k, Kmer::max_m); - dictionary_builder builder(bc); - builder.build_streaming_save(*this, input_filename, output_filename); + builder.build(*this, input_filename, output_filename); } } // namespace sshash diff --git a/tools/build.cpp b/tools/build.cpp index fbc7d1d..e6e3f67 100644 --- a/tools/build.cpp +++ b/tools/build.cpp @@ -76,36 +76,39 @@ int build(int argc, char** argv) { bool check = parser.get("check"); bool has_output = parser.parsed("output_filename"); - dictionary_type dict; - - if (has_output && !check) { - /* Streaming-save path: keeps peak RAM bounded by the build phase - (the strings bit-vector is never fully in RAM). 
After this returns - `dict` is not query-ready; reload from disk to query. */ - auto output_filename = parser.get("output_filename"); - essentials::logger("building data structure (streaming save)..."); - dict.build_streaming_save(input_filename, build_config, output_filename); - essentials::logger("DONE"); + /* Always build via the streaming-save path: peak RAM is bounded by + the build phase only. If the caller didn't pass -o, write to a + tmp file in `tmp_dirname` and delete it after the build (or after + the --check verification). */ + std::string output_filename; + if (has_output) { + output_filename = parser.get("output_filename"); } else { - essentials::logger("building data structure..."); - dict.build(input_filename, build_config); + std::stringstream ss; + ss << build_config.tmp_dirname << "/sshash.tmp.run_" + << pthash::clock_type::now().time_since_epoch().count() << ".index.bin"; + output_filename = ss.str(); + } - if (check) { - check_correctness_lookup_access(dict, input_filename); - check_correctness_navigational_kmer_query(dict, input_filename); - check_correctness_navigational_string_query(dict); - if (build_config.weighted) check_correctness_weights(dict, input_filename); - check_correctness_kmer_iterator(dict); - check_correctness_string_iterator(dict); - } + { + dictionary_type dict; + essentials::logger("building data structure..."); + dict.build(input_filename, build_config, output_filename); + essentials::logger("DONE"); + } - if (has_output) { - auto output_filename = parser.get("output_filename"); - essentials::logger("saving data structure to disk..."); - essentials::save(dict, output_filename.c_str()); - essentials::logger("DONE"); - } + if (check) { + dictionary_type dict; + open_dictionary(dict, output_filename, /*mmap=*/true, build_config.verbose); + check_correctness_lookup_access(dict, input_filename); + check_correctness_navigational_kmer_query(dict, input_filename); + check_correctness_navigational_string_query(dict); + if 
(build_config.weighted) check_correctness_weights(dict, input_filename); + check_correctness_kmer_iterator(dict); + check_correctness_string_iterator(dict); } + if (!has_output) std::remove(output_filename.c_str()); + return 0; } From 8abeb704f02a5d3489e2eb04dd688d110fa07248 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Wed, 6 May 2026 19:24:32 +0200 Subject: [PATCH 29/32] minor --- include/builder/buffered_record_stream.hpp | 7 ++----- include/builder/dictionary_builder.hpp | 9 ++++----- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/include/builder/buffered_record_stream.hpp b/include/builder/buffered_record_stream.hpp index 382505c..28c72de 100644 --- a/include/builder/buffered_record_stream.hpp +++ b/include/builder/buffered_record_stream.hpp @@ -44,14 +44,11 @@ struct buffered_record_stream { /* Open `filename` for forward reading; optionally seek to byte `start_byte` before priming the read window. */ - void open(std::string const& filename, - uint64_t buffer_records = default_buffer_records, + void open(std::string const& filename, uint64_t buffer_records = default_buffer_records, std::streamoff start_byte = 0) { m_buf.resize(std::max(1, buffer_records)); m_in.open(filename, std::ifstream::binary); - if (!m_in.is_open()) { - throw std::runtime_error("cannot open file '" + filename + "'"); - } + if (!m_in.is_open()) { throw std::runtime_error("cannot open file '" + filename + "'"); } if (start_byte != 0) m_in.seekg(start_byte, std::ios::beg); m_pos = 0; m_size = 0; diff --git a/include/builder/dictionary_builder.hpp b/include/builder/dictionary_builder.hpp index 0439d7c..97de32e 100644 --- a/include/builder/dictionary_builder.hpp +++ b/include/builder/dictionary_builder.hpp @@ -65,9 +65,9 @@ struct dictionary_builder // in RAM. After this returns, `d` is *not* query-ready; reload the saved file via `essentials::load` / `essentials::mmap` to query. 
*/ - void build(dictionary& d, // - std::string const& filename, // - std::string const& output_filename) // + void build(dictionary& d, // + std::string const& filename, // + std::string const& output_filename) // { run_steps_1_through_7(d, filename); do_step("step 8 (stream-save dictionary to disk)", [&]() { @@ -215,8 +215,7 @@ struct dictionary_builder // if (num_bytes > 0) { std::cout << "total index size: " << num_bytes << " [B] -- " << essentials::convert(num_bytes, essentials::MB) << " [MB]\n"; - std::cout << " total: " - << (num_kmers > 0 ? (8.0 * num_bytes) / num_kmers : 0.0) + std::cout << " total: " << (num_kmers > 0 ? (8.0 * num_bytes) / num_kmers : 0.0) << " [bits/kmer]" << std::endl; } } From 5aff9f7c4e3a8aa8e118799078ab22d29c608a30 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 6 May 2026 17:30:27 +0000 Subject: [PATCH 30/32] docs: add build-algorithm.md describing the streaming build pipeline Walks through the eight build steps, what each one produces, and where each intermediate lives between steps. Documents the two mechanisms that together cap peak RSS at --ram-limit: * every input-size-scaling intermediate is spilled to disk, * every working buffer is sized as a fixed fraction of ram_limit, with the fractions tabulated. --- build-algorithm.md | 226 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 226 insertions(+) create mode 100644 build-algorithm.md diff --git a/build-algorithm.md b/build-algorithm.md new file mode 100644 index 0000000..992e2ff --- /dev/null +++ b/build-algorithm.md @@ -0,0 +1,226 @@ +# SSHash build algorithm + +This note describes how `sshash build` constructs a dictionary while keeping +peak resident memory bounded by the user-supplied `--ram-limit` (in GiB). + +The design has two ideas, applied uniformly: + +1. **Spill, don't accumulate.** Every intermediate that grows with the input + size is written to a tmp file under `--tmp-dirname` rather than held in a + `std::vector` / bit-vector in RAM. 
Producers append through a small write + buffer; consumers re-read through a small read buffer + (`buffered_record_stream`). +2. **Cap working buffers at a fraction of `--ram-limit`.** Buffers that live + only inside one step are sized as `ram_limit_in_GiB · GiB / N` (with `N` + typically 2 or 8). The constants are picked so that even when several + buffers are alive at the same time across overlapping steps, their sum + stays under the user budget while heap fragmentation across step + transitions is absorbed. + +The build never materializes the final index in RAM. Instead, step 8 +streams it directly to the user-supplied output file (or a tmp file, deleted +on exit, when the user did not pass `-o`). To run `--check`, `tools/build.cpp` +mmaps the saved file and runs the correctness queries against the mmap'd +dictionary. + +--- + +## Pipeline overview + +The orchestration is in `include/builder/dictionary_builder.hpp` +(`run_steps_1_through_7` + `build`). Per-step details are in +`src/builder/{encode_strings,compute_minimizer_tuples,build_sparse_and_skew_index}.cpp`. 
+ +| Step | What it produces | Where it lives between steps | +|------|------------------|------------------------------| +| 1 | Encoded `strings` bit-vector + `strings_offsets` | tmp files (`disk_backed_strings`, `disk_backed_offsets_builder`) | +| 1.1 | Compressed weights (only if `--weighted`) | `weights::builder` (in-RAM, bounded by run-length structure) | +| 2 | Per-thread sorted runs of `minimizer_tuple` | tmp files, one per flushed buffer | +| 3 | Single sorted run of all `minimizer_tuple`s | tmp file (k-way external merge) | +| 4 | Minimizers MPHF F | tmp file (spilled at end of step 5) | +| 5 | Minimizer values replaced by F(minimizer); buffers re-flushed in F-order | new sorted runs, tmp files | +| 6 | Single sorted run keyed by F(minimizer) | tmp file | +| 7.1 | Sparse-index components (`control_codewords`, `mid_load_buckets`) | tmp files | +| 7.2 | Skew-index components (`heavy_load_buckets`, per-partition MPHFs and `positions`) | tmp files | +| 8 | Final on-disk index file | streamed to output, tmp files removed | + +After step 8 the dictionary object `d` is **not** query-ready: the spilled +components were copied into the output file but never read back into `d`. +`finalize_stats` reports `index_size_in_bytes` via `std::filesystem::file_size` +on the saved path. + +--- + +## Step 1 — encode strings (`encode_strings.cpp`) + +Iterates the input FASTA, producing the 2-bit-packed `strings` bit-vector +and the `strings_offsets` array (one offset per sequence + a sentinel). +Both go through disk-backed builders: + +- **`disk_backed_strings`**: appends 2-bit characters into a small in-RAM + word buffer; flushes the buffer to a tmp file when full. +- **`disk_backed_offsets_builder`**: appends one `uint64_t` offset + per sequence into a small write buffer; flushes to a tmp file. + +In-RAM footprint of step 1 is `O(buffer)` regardless of input size. + +## Step 1.1 — weights (optional) + +Only runs with `--weighted`. 
The weights builder uses run-length encoding: +its in-RAM size is proportional to the number of distinct weights, not to +the number of k-mers. + +## Step 2 — compute minimizer tuples (`compute_minimizer_tuples.cpp`) + +Each thread streams its assigned slice of the input via the disk-backed +strings/offsets readers and emits `minimizer_tuple` records into a private +in-RAM buffer: + +```cpp +buffer_size = (ram_limit · GiB) / (2 · sizeof(minimizer_tuple) · num_threads) +``` + +When the buffer fills, the thread sorts it in parallel and flushes a sorted +run to a tmp file (`minimizers_tuples::sort_and_flush`). The factor of 2 +in the denominator leaves headroom for `std::sort`'s allocations and +inter-thread contention; the per-thread split makes the total in-RAM tuple +buffer ≈ `ram_limit / 2`. + +## Step 3 — k-way external merge (`minimizers_tuples::merge`) + +The N tmp files from step 2 are merged into a single sorted run via a +**winner-tree-based external-merge iterator** (`file_merging_iterator`). +Each input file is read through a `buffered_record_stream` +with `default_buffer_records = 4096` records, so the total in-RAM merge +state is `N · 4096 · sizeof(minimizer_tuple)` ≈ tens of MB even for very +many runs. The output is written through a small `std::ofstream` buffer. + +When N == 1 the merge degenerates to a rename + a single streaming scan to +collect bucket statistics; same RAM bound. + +## Step 4 — build minimizers MPHF + +Builds an external-memory partitioned PHF over distinct minimizers, using +pthash's `build_in_external_memory`. The minimizers are streamed from the +sorted run via `streaming_minimizers_iterator` (one buffered ifstream), +and pthash spills its own working hashes under `tmp_dirname` capped by +`mphf_build_config.ram = ram_limit / 2`. + +## Step 5 — replace minimizer values with F(minimizer) + +The merged file is re-read in fixed-size blocks; each block is hashed in +parallel and re-flushed as a new sorted run. 
Two RAM caps are combined: + +```cpp +RAM_available = ram_limit · GiB − sizeof(F) − offsets_builder.num_bytes() +buffer_unbounded = RAM_available / (3 · sizeof(minimizer_tuple)) // 3× = read+sort scratch+write +buffer_cap = (ram_limit · GiB / 8) / sizeof(minimizer_tuple) +buffer_size = min(buffer_unbounded, buffer_cap) +``` + +The `/ 8` cap exists because step 5 leaves heap pages dirtied that linger +into later steps' allocations; capping at one-eighth of the budget keeps +the cumulative RSS under `ram_limit` when steps 6/7 start allocating. + +After step 5, the minimizers MPHF F is **spilled to disk** and the in-RAM +copy is freed: subsequent steps only ever use F(minimizer) values, not F +itself. + +## Step 6 — re-merge in F-order + +Same machinery as step 3, applied to the new sorted runs from step 5. + +## Step 7.1 — sparse index (`build_sparse_and_skew_index.cpp`) + +Constructs `control_codewords` and `mid_load_buckets`. Both are produced as +on-disk `bits::compact_vector` files via `streaming_compact_vector_writer`, +so neither is ever materialized in RAM. + +## Step 7.2 — skew index + +The most RAM-sensitive step; it has three internal phases, all +disk-backed: + +- **Phase B (k-mer extraction requests).** Heavy-bucket entries become + `kmer_extraction_request` records. They are external-sorted by + `starting_pos` so that k-mer extraction reduces to a single forward + scan over `strings`. The request buffer is capped at + `ram_limit / 8 / sizeof(kmer_extraction_request)`; flushed runs are + merged with `file_merging_iterator`. +- **Per-partition kmer files.** While walking `strings` in request-sorted + order, each extracted k-mer is written to its partition's tmp file via + a buffered writer; this file is the input to the partition's MPHF. +- **Phase C (per-partition MPHF + `positions`).** For each skew partition: + 1. 
Build the partition MPHF with pthash external-memory (`ram = ram_limit / 2`, + iterator: `skew_partition_kmer_iterator` over the partition's tmp file). + 2. Stream-read the partition file again, emit `(F(kmer), pos_in_bucket)` + tuples; external-sort them in `ram_limit / 8`-sized buffers and merge. + 3. Pack the sorted tuples into the partition's `positions` + compact_vector via `streaming_compact_vector_writer`. + + Only the freshly-built MPHF for the *current* partition lives in RAM + during phase C; once spilled (`essentials::save`), it is freed before the + next partition starts. `positions` is fully on-disk. + +## Step 8 — stream-save (`include/builder/streaming_save.hpp`) + +The dictionary `d` is walked by `essentials::saver`, but every spilled +component is intercepted via an **address+type-keyed substitution map** +(`typed_address_sub`): when the saver visits a registered (address, type) +pair, it appends the bytes of the corresponding tmp file straight into the +output stream instead of reading from `d`. The strings bit-vector goes +through the same mechanism via `disk_backed_strings`. + +Concretely, the registered substitutions are: + +| Component | Source tmp file | +|-----------------------------------|----------------------------------------| +| `m_ssi.codewords.control_codewords` | step 7.1 | +| `m_ssi.mid_load_buckets` | step 7.1 | +| `m_ssi.ski.heavy_load_buckets` | step 7.2 phase B | +| `m_ssi.codewords.mphf` | step 5 spill | +| `m_ssi.ski.positions[i]` | step 7.2 phase C, per partition | +| `m_ssi.ski.mphfs[i]` | step 7.2 phase C, per partition | +| `m_spss.strings` | step 1 (`disk_backed_strings`) | + +Because the substitutions are by `(address, type)` pair, a struct's address +coinciding with its first member's address does not cause confusion. + +After step 8 returns, the tmp files are removed and `finalize_stats` reads +the saved file's size with `std::filesystem::file_size`. 
+ +--- + +## How the RAM cap is enforced — summary + +The on-disk index size grows with `num_kmers`. The build's **resident** +memory does not, because every component that scales with input size is +either: + +- **always on disk** (`strings`, `strings_offsets`, all sorted minimizer + runs, the merged minimizers file, the sparse-index compact_vectors, the + skew-index per-partition kmer/positions files, the codewords MPHF and + per-partition MPHFs), or +- **bounded by a working buffer** sized as a fraction of `ram_limit`: + + | Buffer | Cap | + |----------------------------------------|--------------------------| + | Step 2 per-thread minimizer buffer | `ram_limit / 2 / num_threads` | + | Step 5 hashing buffer | `min(ram/8, RAM_available/3)` | + | Step 7.2 kmer-request external sort | `ram_limit / 8` | + | Step 7.2 phase-C `position_tuple` sort | `ram_limit / 8` | + | pthash external-memory builds | `ram_limit / 2` (its own `--ram`) | + | Every disk-backed reader/writer | `default_buffer_records ≈ 32 KiB` | + | Every external merge front (per run) | `4096 · sizeof(T)` | + +The fractions (`/2` for the dominant per-step buffer, `/8` for buffers +that span step boundaries) are chosen so that overlapping allocations and +heap fragmentation between steps stay under `ram_limit` in practice. There +is a hard floor of `min_ram_limit_in_GiB` (enforced in +`validate_and_normalize_build_config`) below which step 4's MPHF builder +no longer has enough room to make progress. + +The result: peak RSS during the build is governed by `--ram-limit`, not by +the input size or by the on-disk index size, and the saved index is +identical (byte-for-byte) to one written by an in-RAM builder followed by +`essentials::save`. 
From 730f0ad3c1fb11ca86ea6e27b14c3f60d3c36a6d Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 6 May 2026 17:35:38 +0000 Subject: [PATCH 31/32] docs(build-algorithm): use real CLI flag names (-g, -d) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The doc referred to --ram-limit and --tmp-dirname; the actual short flags exposed by sshash build are -g (RAM limit in GiB) and -d (tmp dir). The pthash "ram" reference was a programmatic config field, not a CLI flag — clarified. --- build-algorithm.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/build-algorithm.md b/build-algorithm.md index 992e2ff..c437166 100644 --- a/build-algorithm.md +++ b/build-algorithm.md @@ -1,16 +1,16 @@ # SSHash build algorithm This note describes how `sshash build` constructs a dictionary while keeping -peak resident memory bounded by the user-supplied `--ram-limit` (in GiB). +peak resident memory bounded by the user-supplied `-g` (in GiB). The design has two ideas, applied uniformly: 1. **Spill, don't accumulate.** Every intermediate that grows with the input - size is written to a tmp file under `--tmp-dirname` rather than held in a + size is written to a tmp file under `-d` (tmp dir) rather than held in a `std::vector` / bit-vector in RAM. Producers append through a small write buffer; consumers re-read through a small read buffer (`buffered_record_stream`). -2. **Cap working buffers at a fraction of `--ram-limit`.** Buffers that live +2. **Cap working buffers at a fraction of `-g`.** Buffers that live only inside one step are sized as `ram_limit_in_GiB · GiB / N` (with `N` typically 2 or 8). 
The constants are picked so that even when several buffers are alive at the same time across overlapping steps, their sum @@ -209,7 +209,7 @@ either: | Step 5 hashing buffer | `min(ram/8, RAM_available/3)` | | Step 7.2 kmer-request external sort | `ram_limit / 8` | | Step 7.2 phase-C `position_tuple` sort | `ram_limit / 8` | - | pthash external-memory builds | `ram_limit / 2` (its own `--ram`) | + | pthash external-memory builds | `ram_limit / 2` (its own `ram` field) | | Every disk-backed reader/writer | `default_buffer_records ≈ 32 KiB` | | Every external merge front (per run) | `4096 · sizeof(T)` | @@ -220,7 +220,7 @@ is a hard floor of `min_ram_limit_in_GiB` (enforced in `validate_and_normalize_build_config`) below which step 4's MPHF builder no longer has enough room to make progress. -The result: peak RSS during the build is governed by `--ram-limit`, not by +The result: peak RSS during the build is governed by `-g`, not by the input size or by the on-disk index size, and the saved index is identical (byte-for-byte) to one written by an in-RAM builder followed by `essentials::save`. From f19ffccab892b132d90b4e7d1647455afd51cc5a Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 6 May 2026 17:45:47 +0000 Subject: [PATCH 32/32] docs(build-algorithm): rephrase 'O(buffer)' as 'proportional to the buffer size' --- build-algorithm.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/build-algorithm.md b/build-algorithm.md index c437166..7490726 100644 --- a/build-algorithm.md +++ b/build-algorithm.md @@ -62,7 +62,8 @@ Both go through disk-backed builders: - **`disk_backed_offsets_builder`**: appends one `uint64_t` offset per sequence into a small write buffer; flushes to a tmp file. -In-RAM footprint of step 1 is `O(buffer)` regardless of input size. +In-RAM footprint of step 1 is proportional to the buffer size, regardless of +input size. ## Step 1.1 — weights (optional)