From c9cb04b9d15459956128db8e22f633ddb5c36452 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Mon, 9 Mar 2026 20:40:33 +0100 Subject: [PATCH 1/2] allow memory mapping --- external/pthash | 2 +- .../builder/build_sparse_and_skew_index.cpp | 38 +++++++++++-------- include/sparse_and_skew_index.hpp | 13 +++---- tools/common.hpp | 5 ++- tools/query.cpp | 5 ++- tools/sshash.cpp | 10 ++++- 6 files changed, 44 insertions(+), 29 deletions(-) diff --git a/external/pthash b/external/pthash index 1bbe0af..d4ada34 160000 --- a/external/pthash +++ b/external/pthash @@ -1 +1 @@ -Subproject commit 1bbe0af07927de86334857933a1ba846e0623152 +Subproject commit d4ada34bf750eec01629d3511e58e5c4db1c1c6a diff --git a/include/builder/build_sparse_and_skew_index.cpp b/include/builder/build_sparse_and_skew_index.cpp index c11c056..e9f03b6 100644 --- a/include/builder/build_sparse_and_skew_index.cpp +++ b/include/builder/build_sparse_and_skew_index.cpp @@ -170,9 +170,10 @@ void dictionary_builder::build_sparse_and_skew_index( heavy_load_buckets_builder.resize(num_minimizer_positions_of_buckets_in_skew_index, num_bits_per_offset); - d.m_ssi.begin_buckets_of_size.resize(min_size + 1, 0); - { + std::vector begin_buckets_of_size; + begin_buckets_of_size.resize(min_size + 1, 0); + uint64_t curr_bucket_size = 2; uint64_t list_id = 0; uint64_t mid_load_buckets_size = 0; @@ -189,7 +190,7 @@ void dictionary_builder::build_sparse_and_skew_index( if (bucket_size > curr_bucket_size) { while (bucket_size > curr_bucket_size) ++curr_bucket_size; if (curr_bucket_size <= min_size) { - d.m_ssi.begin_buckets_of_size[curr_bucket_size] = mid_load_buckets_size; + begin_buckets_of_size[curr_bucket_size] = mid_load_buckets_size; } else { while (curr_bucket_size > upper) { lower = upper; @@ -235,6 +236,8 @@ void dictionary_builder::build_sparse_and_skew_index( } } } + + d.m_ssi.begin_buckets_of_size = std::move(begin_buckets_of_size); } control_codewords_builder.build(d.m_ssi.codewords.control_codewords); 
@@ -259,8 +262,6 @@ void dictionary_builder::build_sparse_and_skew_index( /* step 2. build skew index */ timer.start(); std::vector num_kmers_in_partition(num_partitions, 0); - d.m_ssi.ski.mphfs.resize(num_partitions); - d.m_ssi.ski.positions.resize(num_partitions); { uint64_t partition_id = 0; @@ -310,6 +311,11 @@ void dictionary_builder::build_sparse_and_skew_index( } { + std::vector> mphfs; + std::vector positions; + mphfs.resize(num_partitions); + positions.resize(num_partitions); + pthash::build_configuration mphf_build_config; mphf_build_config.lambda = build_config.lambda + 2.0; /* Use higher lambda here since we have less keys. */ @@ -360,29 +366,28 @@ void dictionary_builder::build_sparse_and_skew_index( << ")..." << std::endl; } - auto& mphf = d.m_ssi.ski.mphfs[partition_id]; - mphf.build_in_internal_memory(kmers.begin(), kmers.size(), mphf_build_config); + auto& F = mphfs[partition_id]; + F.build_in_internal_memory(kmers.begin(), kmers.size(), mphf_build_config); if (build_config.verbose) { std::cout << " built mphs[" << partition_id << "] for " << kmers.size() << " kmers; bits/key = " - << static_cast(mphf.num_bits()) / mphf.num_keys() - << std::endl; + << static_cast(F.num_bits()) / F.num_keys() << std::endl; } for (uint64_t i = 0; i != kmers.size(); ++i) { Kmer kmer = kmers[i]; - uint64_t pos = mphf(kmer); + uint64_t pos = F(kmer); uint32_t pos_in_bucket = positions_in_bucket[i]; cvb_positions.set(pos, pos_in_bucket); } - auto& positions = d.m_ssi.ski.positions[partition_id]; - cvb_positions.build(positions); + auto& P = positions[partition_id]; + cvb_positions.build(positions[partition_id]); if (build_config.verbose) { - std::cout << " built positions[" << partition_id << "] for " - << positions.size() << " kmers; bits/key = " - << (positions.num_bytes() * 8.0) / positions.size() << std::endl; + std::cout << " built positions[" << partition_id << "] for " << P.size() + << " kmers; bits/key = " << (P.num_bytes() * 8.0) / P.size() + << std::endl; } } 
@@ -437,6 +442,9 @@ void dictionary_builder::build_sparse_and_skew_index( } } assert(partition_id == num_partitions - 1); + + d.m_ssi.ski.mphfs = std::move(mphfs); + d.m_ssi.ski.positions = std::move(positions); } timer.stop(); diff --git a/include/sparse_and_skew_index.hpp b/include/sparse_and_skew_index.hpp index d2ede11..f30f860 100644 --- a/include/sparse_and_skew_index.hpp +++ b/include/sparse_and_skew_index.hpp @@ -8,10 +8,7 @@ namespace sshash { template struct skew_index // { - skew_index() { - mphfs.resize(0); - positions.resize(0); - } + skew_index() {} /* Returns the number of kmers in the index. */ uint64_t print_info() const { @@ -47,7 +44,7 @@ struct skew_index // } uint64_t num_bits() const { - uint64_t n = (2 * sizeof(size_t)) * 8; /* for std::vector::size */ + uint64_t n = (2 * sizeof(size_t)) * 8; /* for span's size */ for (uint64_t partition_id = 0; partition_id != mphfs.size(); ++partition_id) { auto const& f = mphfs[partition_id]; auto const& p = positions[partition_id]; @@ -66,8 +63,8 @@ struct skew_index // visit_impl(visitor, *this); } - std::vector> mphfs; - std::vector positions; + essentials::owning_span> mphfs; + essentials::owning_span positions; bits::compact_vector heavy_load_buckets; private: @@ -156,7 +153,7 @@ struct sparse_and_skew_index // } minimizers_control_map codewords; - std::vector begin_buckets_of_size; + essentials::owning_span begin_buckets_of_size; bits::compact_vector mid_load_buckets; skew_index ski; diff --git a/tools/common.hpp b/tools/common.hpp index 42acb52..f493ff1 100644 --- a/tools/common.hpp +++ b/tools/common.hpp @@ -17,8 +17,9 @@ void random_kmer(char* kmer, uint64_t k) { } template -void load_dictionary(Dict& dict, std::string const& index_filename, bool verbose) { - const uint64_t num_bytes_read = essentials::load(dict, index_filename.c_str()); +void open_dictionary(Dict& dict, std::string const& index_filename, bool mmap, bool verbose) { + const uint64_t num_bytes_read = mmap ? 
essentials::mmap(dict, index_filename.c_str()) + : essentials::load(dict, index_filename.c_str()); if (verbose) { std::cout << "total index size: " << num_bytes_read << " [B] -- " << essentials::convert(num_bytes_read, essentials::MB) << " [MB] (" diff --git a/tools/query.cpp b/tools/query.cpp index 0ae64ff..de4a49b 100644 --- a/tools/query.cpp +++ b/tools/query.cpp @@ -14,16 +14,19 @@ int query(int argc, char** argv) { "Use this option if more the one DNA line must be parsed after each header." " Only valid for FASTA files (not FASTQ).", "--multiline", false, true); + parser.add("mmap", "Memory-map from file on disk rather than loading in RAM.", "--mmap", false, + true); parser.add("verbose", "Verbose output.", "--verbose", false, true); if (!parser.parse()) return 0; auto index_filename = parser.get("index_filename"); auto query_filename = parser.get("query_filename"); + bool mmap = parser.get("mmap"); bool verbose = parser.get("verbose"); bool multiline = parser.get("multiline"); dictionary_type dict; - load_dictionary(dict, index_filename, verbose); + open_dictionary(dict, index_filename, mmap, verbose); essentials::logger("performing queries from file '" + query_filename + "'..."); essentials::timer t; diff --git a/tools/sshash.cpp b/tools/sshash.cpp index 446d974..88d490f 100644 --- a/tools/sshash.cpp +++ b/tools/sshash.cpp @@ -20,12 +20,15 @@ using namespace sshash; int check(int argc, char** argv) { cmd_line_parser::parser parser(argc, argv); parser.add("index_filename", "Must be a file generated with the tool 'build'.", "-i", true); + parser.add("mmap", "Memory-map from file on disk rather than loading in RAM.", "--mmap", false, + true); parser.add("verbose", "Verbose output.", "--verbose", false, true); if (!parser.parse()) return 0; auto index_filename = parser.get("index_filename"); + bool mmap = parser.get("mmap"); bool verbose = parser.get("verbose"); dictionary_type dict; - load_dictionary(dict, index_filename, verbose); + open_dictionary(dict, 
index_filename, mmap, verbose); check_dictionary(dict); check_correctness_navigational_string_query(dict); check_correctness_kmer_iterator(dict); @@ -36,12 +39,15 @@ int check(int argc, char** argv) { int bench(int argc, char** argv) { cmd_line_parser::parser parser(argc, argv); parser.add("index_filename", "Must be a file generated with the tool 'build'.", "-i", true); + parser.add("mmap", "Memory-map from file on disk rather than loading in RAM.", "--mmap", false, + true); parser.add("verbose", "Verbose output.", "--verbose", false, true); if (!parser.parse()) return 0; auto index_filename = parser.get("index_filename"); + bool mmap = parser.get("mmap"); bool verbose = parser.get("verbose"); dictionary_type dict; - load_dictionary(dict, index_filename, verbose); + open_dictionary(dict, index_filename, mmap, verbose); essentials::json_lines perf_stats; perf_stats.add("index_filename", index_filename.c_str()); From 4312737cdb4a3f9ee28b7336cf5faffc2be3f71b Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Mon, 9 Mar 2026 20:53:47 +0100 Subject: [PATCH 2/2] updated external/pthash --- external/pthash | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/external/pthash b/external/pthash index d4ada34..e04a192 160000 --- a/external/pthash +++ b/external/pthash @@ -1 +1 @@ -Subproject commit d4ada34bf750eec01629d3511e58e5c4db1c1c6a +Subproject commit e04a1920ffeae9e7d876acd0362cab79605f7af3