Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 23 additions & 15 deletions include/builder/build_sparse_and_skew_index.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -170,9 +170,10 @@ void dictionary_builder<Kmer, Offsets>::build_sparse_and_skew_index(
heavy_load_buckets_builder.resize(num_minimizer_positions_of_buckets_in_skew_index,
num_bits_per_offset);

d.m_ssi.begin_buckets_of_size.resize(min_size + 1, 0);

{
std::vector<uint32_t> begin_buckets_of_size;
begin_buckets_of_size.resize(min_size + 1, 0);

uint64_t curr_bucket_size = 2;
uint64_t list_id = 0;
uint64_t mid_load_buckets_size = 0;
Expand All @@ -189,7 +190,7 @@ void dictionary_builder<Kmer, Offsets>::build_sparse_and_skew_index(
if (bucket_size > curr_bucket_size) {
while (bucket_size > curr_bucket_size) ++curr_bucket_size;
if (curr_bucket_size <= min_size) {
d.m_ssi.begin_buckets_of_size[curr_bucket_size] = mid_load_buckets_size;
begin_buckets_of_size[curr_bucket_size] = mid_load_buckets_size;
} else {
while (curr_bucket_size > upper) {
lower = upper;
Expand Down Expand Up @@ -235,6 +236,8 @@ void dictionary_builder<Kmer, Offsets>::build_sparse_and_skew_index(
}
}
}

d.m_ssi.begin_buckets_of_size = std::move(begin_buckets_of_size);
}

control_codewords_builder.build(d.m_ssi.codewords.control_codewords);
Expand All @@ -259,8 +262,6 @@ void dictionary_builder<Kmer, Offsets>::build_sparse_and_skew_index(
/* step 2. build skew index */
timer.start();
std::vector<uint64_t> num_kmers_in_partition(num_partitions, 0);
d.m_ssi.ski.mphfs.resize(num_partitions);
d.m_ssi.ski.positions.resize(num_partitions);

{
uint64_t partition_id = 0;
Expand Down Expand Up @@ -310,6 +311,11 @@ void dictionary_builder<Kmer, Offsets>::build_sparse_and_skew_index(
}

{
std::vector<kmers_pthash_type<Kmer>> mphfs;
std::vector<bits::compact_vector> positions;
mphfs.resize(num_partitions);
positions.resize(num_partitions);

pthash::build_configuration mphf_build_config;
mphf_build_config.lambda =
build_config.lambda + 2.0; /* Use higher lambda here since we have less keys. */
Expand Down Expand Up @@ -360,29 +366,28 @@ void dictionary_builder<Kmer, Offsets>::build_sparse_and_skew_index(
<< ")..." << std::endl;
}

auto& mphf = d.m_ssi.ski.mphfs[partition_id];
mphf.build_in_internal_memory(kmers.begin(), kmers.size(), mphf_build_config);
auto& F = mphfs[partition_id];
F.build_in_internal_memory(kmers.begin(), kmers.size(), mphf_build_config);

if (build_config.verbose) {
std::cout << " built mphs[" << partition_id << "] for " << kmers.size()
<< " kmers; bits/key = "
<< static_cast<double>(mphf.num_bits()) / mphf.num_keys()
<< std::endl;
<< static_cast<double>(F.num_bits()) / F.num_keys() << std::endl;
}

for (uint64_t i = 0; i != kmers.size(); ++i) {
Kmer kmer = kmers[i];
uint64_t pos = mphf(kmer);
uint64_t pos = F(kmer);
uint32_t pos_in_bucket = positions_in_bucket[i];
cvb_positions.set(pos, pos_in_bucket);
}
auto& positions = d.m_ssi.ski.positions[partition_id];
cvb_positions.build(positions);
auto& P = positions[partition_id];
cvb_positions.build(positions[partition_id]);

if (build_config.verbose) {
std::cout << " built positions[" << partition_id << "] for "
<< positions.size() << " kmers; bits/key = "
<< (positions.num_bytes() * 8.0) / positions.size() << std::endl;
std::cout << " built positions[" << partition_id << "] for " << P.size()
<< " kmers; bits/key = " << (P.num_bytes() * 8.0) / P.size()
<< std::endl;
}
}

Expand Down Expand Up @@ -437,6 +442,9 @@ void dictionary_builder<Kmer, Offsets>::build_sparse_and_skew_index(
}
}
assert(partition_id == num_partitions - 1);

d.m_ssi.ski.mphfs = std::move(mphfs);
d.m_ssi.ski.positions = std::move(positions);
}

timer.stop();
Expand Down
13 changes: 5 additions & 8 deletions include/sparse_and_skew_index.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,7 @@ namespace sshash {
template <typename Kmer>
struct skew_index //
{
skew_index() {
mphfs.resize(0);
positions.resize(0);
}
skew_index() {}

/* Returns the number of kmers in the index. */
uint64_t print_info() const {
Expand Down Expand Up @@ -47,7 +44,7 @@ struct skew_index //
}

uint64_t num_bits() const {
uint64_t n = (2 * sizeof(size_t)) * 8; /* for std::vector::size */
uint64_t n = (2 * sizeof(size_t)) * 8; /* for the two spans' sizes */
for (uint64_t partition_id = 0; partition_id != mphfs.size(); ++partition_id) {
auto const& f = mphfs[partition_id];
auto const& p = positions[partition_id];
Expand All @@ -66,8 +63,8 @@ struct skew_index //
visit_impl(visitor, *this);
}

std::vector<kmers_pthash_type<Kmer>> mphfs;
std::vector<bits::compact_vector> positions;
essentials::owning_span<kmers_pthash_type<Kmer>> mphfs;
essentials::owning_span<bits::compact_vector> positions;
bits::compact_vector heavy_load_buckets;

private:
Expand Down Expand Up @@ -156,7 +153,7 @@ struct sparse_and_skew_index //
}

minimizers_control_map codewords;
std::vector<uint32_t> begin_buckets_of_size;
essentials::owning_span<uint32_t> begin_buckets_of_size;
bits::compact_vector mid_load_buckets;
skew_index<Kmer> ski;

Expand Down
5 changes: 3 additions & 2 deletions tools/common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,9 @@ void random_kmer(char* kmer, uint64_t k) {
}

template <typename Dict>
void load_dictionary(Dict& dict, std::string const& index_filename, bool verbose) {
const uint64_t num_bytes_read = essentials::load(dict, index_filename.c_str());
void open_dictionary(Dict& dict, std::string const& index_filename, bool mmap, bool verbose) {
const uint64_t num_bytes_read = mmap ? essentials::mmap(dict, index_filename.c_str())
: essentials::load(dict, index_filename.c_str());
if (verbose) {
std::cout << "total index size: " << num_bytes_read << " [B] -- "
<< essentials::convert(num_bytes_read, essentials::MB) << " [MB] ("
Expand Down
5 changes: 4 additions & 1 deletion tools/query.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,16 +14,19 @@ int query(int argc, char** argv) {
"Use this option if more the one DNA line must be parsed after each header."
" Only valid for FASTA files (not FASTQ).",
"--multiline", false, true);
parser.add("mmap", "Memory-map from file on disk rather than loading in RAM.", "--mmap", false,
true);
parser.add("verbose", "Verbose output.", "--verbose", false, true);
if (!parser.parse()) return 0;

auto index_filename = parser.get<std::string>("index_filename");
auto query_filename = parser.get<std::string>("query_filename");
bool mmap = parser.get<bool>("mmap");
bool verbose = parser.get<bool>("verbose");
bool multiline = parser.get<bool>("multiline");

dictionary_type dict;
load_dictionary(dict, index_filename, verbose);
open_dictionary(dict, index_filename, mmap, verbose);

essentials::logger("performing queries from file '" + query_filename + "'...");
essentials::timer<std::chrono::high_resolution_clock, std::chrono::milliseconds> t;
Expand Down
10 changes: 8 additions & 2 deletions tools/sshash.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,15 @@ using namespace sshash;
int check(int argc, char** argv) {
cmd_line_parser::parser parser(argc, argv);
parser.add("index_filename", "Must be a file generated with the tool 'build'.", "-i", true);
parser.add("mmap", "Memory-map from file on disk rather than loading in RAM.", "--mmap", false,
true);
parser.add("verbose", "Verbose output.", "--verbose", false, true);
if (!parser.parse()) return 0;
auto index_filename = parser.get<std::string>("index_filename");
bool mmap = parser.get<bool>("mmap");
bool verbose = parser.get<bool>("verbose");
dictionary_type dict;
load_dictionary(dict, index_filename, verbose);
open_dictionary(dict, index_filename, mmap, verbose);
check_dictionary(dict);
check_correctness_navigational_string_query(dict);
check_correctness_kmer_iterator(dict);
Expand All @@ -36,12 +39,15 @@ int check(int argc, char** argv) {
int bench(int argc, char** argv) {
cmd_line_parser::parser parser(argc, argv);
parser.add("index_filename", "Must be a file generated with the tool 'build'.", "-i", true);
parser.add("mmap", "Memory-map from file on disk rather than loading in RAM.", "--mmap", false,
true);
parser.add("verbose", "Verbose output.", "--verbose", false, true);
if (!parser.parse()) return 0;
auto index_filename = parser.get<std::string>("index_filename");
bool mmap = parser.get<bool>("mmap");
bool verbose = parser.get<bool>("verbose");
dictionary_type dict;
load_dictionary(dict, index_filename, verbose);
open_dictionary(dict, index_filename, mmap, verbose);

essentials::json_lines perf_stats;
perf_stats.add("index_filename", index_filename.c_str());
Expand Down
Loading