Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 29 additions & 19 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -48,16 +48,22 @@ else()
set(CONDA_BUILD FALSE)
endif()

# User-facing switch, ON by default. It is tested by the
# if(SSHASH_BUILD_EXECUTABLES) block further down in this file.
option(SSHASH_BUILD_EXECUTABLES "Build sshash executables" ON)
# Report the effective configuration at configure time.
MESSAGE(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
MESSAGE(STATUS "Conda build: ${CONDA_BUILD}")
MESSAGE(STATUS "Installation prefix: ${CMAKE_INSTALL_PREFIX}")
MESSAGE(STATUS "Compiling for processor: ${CMAKE_SYSTEM_PROCESSOR}")
MESSAGE(STATUS "Compiling with flags:${CMAKE_CXX_FLAGS}")
MESSAGE(STATUS "Build executables: ${SSHASH_BUILD_EXECUTABLES}")

# Sources from external/gz; expanded into the sshash_static library below.
set(Z_LIB_SOURCES
external/gz/zip_stream.cpp
)

# CityHash implementation file. Listing it here makes it a translation unit
# of the sshash_static library (it is expanded in add_library below).
set(CITYHASH_SOURCES
external/cityhash/cityhash.cpp
)

set(SSHASH_SOURCES
src/build.cpp
src/dictionary.cpp
Expand All @@ -79,32 +85,36 @@ set(SSHASH_INCLUDE_DIRS
# Create a static lib made of the gz sources, the CityHash sources and the
# sshash sources.
add_library(sshash_static STATIC
${Z_LIB_SOURCES}
${CITYHASH_SOURCES}
${SSHASH_SOURCES}
)

# PUBLIC: the include directories apply to sshash_static itself and to every
# target that links against it.
target_include_directories(sshash_static PUBLIC ${SSHASH_INCLUDE_DIRS})

add_executable(sshash tools/sshash.cpp)
target_include_directories(sshash PUBLIC ${SSHASH_INCLUDE_DIRS})
target_link_libraries(sshash
z
)

# tests:
if(SSHASH_BUILD_EXECUTABLES)
add_executable(sshash tools/sshash.cpp)
target_include_directories(sshash PUBLIC ${SSHASH_INCLUDE_DIRS})
target_link_libraries(sshash
sshash_static
z
)

add_executable(test_alphabet test/test_alphabet.cpp)
target_link_libraries(test_alphabet
sshash_static
)
# tests:

add_executable(check test/check.cpp)
target_link_libraries(check
sshash_static
z
)
add_executable(test_alphabet test/test_alphabet.cpp)
target_link_libraries(test_alphabet
sshash_static
)

if (CONDA_BUILD)
install(TARGETS sshash
RUNTIME DESTINATION bin
add_executable(check test/check.cpp)
target_link_libraries(check
sshash_static
z
)

if (CONDA_BUILD)
install(TARGETS sshash
RUNTIME DESTINATION bin
)
endif()
endif()
6 changes: 5 additions & 1 deletion external/cityhash/cityhash.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@
#include <algorithm>
#include <string.h> // for memcpy and memset

namespace cityhash {

using namespace std;

static uint64 UNALIGNED_LOAD64(const char* p) {
Expand Down Expand Up @@ -442,4 +444,6 @@ uint128 CityHashCrc128(const char* s, size_t len) {
}
}

#endif
#endif

} // namespace cityhash
4 changes: 4 additions & 0 deletions external/cityhash/cityhash.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,8 @@
#include <stdlib.h> // for size_t.
#include <utility>

namespace cityhash {

// Microsoft Visual Studio may not have stdint.h.
#if defined(_MSC_VER) && (_MSC_VER < 1600)
typedef unsigned char uint8_t;
Expand Down Expand Up @@ -112,4 +114,6 @@ void CityHashCrc256(const char* s, size_t len, uint64* result);

#endif // __SSE4_2__

} // namespace cityhash

#endif // CITY_HASH_H_
20 changes: 16 additions & 4 deletions include/buckets_statistics.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,23 +11,30 @@ struct buckets_statistics {
, m_num_kmers(0)
, m_num_minimizer_positions(0)
, m_max_num_kmers_in_super_kmer(0)
, m_max_bucket_size(0) {}
, m_max_bucket_size(0)
, m_max_sparse_buckets_per_size(0) {}

buckets_statistics(uint64_t num_buckets, uint64_t num_kmers, uint64_t num_minimizer_positions)
: m_num_buckets(num_buckets)
, m_num_kmers(num_kmers)
, m_num_minimizer_positions(num_minimizer_positions)
, m_max_num_kmers_in_super_kmer(0)
, m_max_bucket_size(0) //
, m_max_bucket_size(0)
, m_max_sparse_buckets_per_size(0) //
{
m_bucket_sizes.resize(MAX_BUCKET_SIZE + 1, 0);
m_total_kmers.resize(MAX_BUCKET_SIZE + 1, 0);
m_super_kmer_sizes.resize(MAX_STRING_SIZE + 1, 0);
}

void add_bucket_size(uint64_t bucket_size) {
if (bucket_size < MAX_BUCKET_SIZE + 1) { m_bucket_sizes[bucket_size] += 1; }
if (bucket_size > m_max_bucket_size) { m_max_bucket_size = bucket_size; }
if (bucket_size < MAX_BUCKET_SIZE + 1) {
m_bucket_sizes[bucket_size] += 1;
if (bucket_size > 1) {
m_max_sparse_buckets_per_size = std::max(m_max_sparse_buckets_per_size, m_bucket_sizes[bucket_size]);
}
}
m_max_bucket_size = std::max(m_max_bucket_size, bucket_size);
}

void add_num_kmers_in_super_kmer(uint64_t bucket_size,
Expand All @@ -49,6 +56,7 @@ struct buckets_statistics {
/* Read-only accessors; each returns the stored counter by value. */
uint64_t num_minimizer_positions() const { return m_num_minimizer_positions; }
uint64_t max_num_kmers_in_super_kmer() const { return m_max_num_kmers_in_super_kmer; }
/* Largest bucket_size passed to add_bucket_size() so far. */
uint64_t max_bucket_size() const { return m_max_bucket_size; }
/* Largest value reached by any m_bucket_sizes[bucket_size] counter for
   1 < bucket_size < MAX_BUCKET_SIZE + 1, as maintained by add_bucket_size().
   build_sparse_and_skew_index() derives the number of bits for list_id
   from this value. */
uint64_t max_sparse_buckets_per_size() const { return m_max_sparse_buckets_per_size; }

void print_full() const {
std::cout << "=== bucket statistics (full) === \n";
Expand Down Expand Up @@ -138,6 +146,9 @@ struct buckets_statistics {
if (rhs.max_bucket_size() > m_max_bucket_size) {
m_max_bucket_size = rhs.max_bucket_size();
}
if (rhs.max_sparse_buckets_per_size() > m_max_sparse_buckets_per_size) {
m_max_sparse_buckets_per_size = rhs.max_sparse_buckets_per_size();
}

assert(m_bucket_sizes.size() == rhs.m_bucket_sizes.size());
for (uint64_t i = 0; i != m_bucket_sizes.size(); ++i) {
Expand All @@ -160,6 +171,7 @@ struct buckets_statistics {

uint64_t m_max_num_kmers_in_super_kmer;
uint64_t m_max_bucket_size;
uint64_t m_max_sparse_buckets_per_size;

std::vector<uint64_t> m_bucket_sizes;
std::vector<uint64_t> m_total_kmers;
Expand Down
65 changes: 42 additions & 23 deletions include/builder/build_sparse_and_skew_index.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,6 @@ void dictionary_builder<Kmer, Offsets>::build_sparse_and_skew_index(
const uint64_t min_size = 1ULL << constants::min_l;
const uint64_t num_bits_per_offset = strings_offsets_builder.num_bits_per_offset();

if (build_config.verbose) {
std::cout << "num_bits_per_offset = " << num_bits_per_offset << std::endl;
}

bits::compact_vector::builder control_codewords_builder;
control_codewords_builder.resize(num_minimizers, num_bits_per_offset + 1);

mm::file_source<minimizer_tuple> input(minimizers.get_minimizers_filename(),
mm::advice::sequential);

Expand All @@ -33,11 +26,10 @@ void dictionary_builder<Kmer, Offsets>::build_sparse_and_skew_index(
uint64_t num_minimizer_positions_of_buckets_larger_than_1 = 0;
uint64_t num_minimizer_positions_of_buckets_in_skew_index = 0;

// First pass: collect bucket statistics to compute tighter bound
for (minimizers_tuples_iterator it(input.data(), input.data() + input.size()); //
it.has_next(); it.next()) //
{
const uint64_t bucket_id = it.minimizer();
assert(bucket_id < num_minimizers);
auto bucket = it.bucket();
const uint64_t bucket_size = bucket.size();
buckets_stats.add_bucket_size(bucket_size);
Expand All @@ -53,24 +45,31 @@ void dictionary_builder<Kmer, Offsets>::build_sparse_and_skew_index(
num_super_kmers_in_buckets_larger_than_1 += bucket.num_super_kmers();
}

uint64_t prev_pos_in_seq = constants::invalid_uint64;
for (auto mt : bucket) {
if (bucket_size == 1 and mt.pos_in_seq != prev_pos_in_seq) {
/*
For minimizers occurring once, store a (log(N)+1)-bit
code, as follows: |offset|0|, i.e., the LSB is 0.
*/
uint64_t code = mt.pos_in_seq << 1; // first LS bit encodes status code: 0
assert(code < (uint64_t(1) << (num_bits_per_offset + 1)));
control_codewords_builder.set(bucket_id, code);
prev_pos_in_seq = mt.pos_in_seq;
}
buckets_stats.add_num_kmers_in_super_kmer(bucket_size, mt.num_kmers_in_super_kmer);
}
}

assert(buckets_stats.num_buckets() == num_minimizers);

// Calculate bits needed for control codewords encoding:
// Encoding format: ((list_id << min_l) | (bucket_size - 2)) << 2 | status_code
// We need: 2 bits (status) + min_l bits (bucket_size) + bits for list_id
// list_id is bounded by the maximum number of buckets sharing the same size
const uint64_t bits_for_list_id = std::ceil(std::log2(buckets_stats.max_sparse_buckets_per_size() + 1));
const uint64_t num_bits_for_control = std::max(num_bits_per_offset + 1,
2 + constants::min_l + bits_for_list_id);

if (build_config.verbose) {
std::cout << "num_bits_per_offset = " << num_bits_per_offset << std::endl;
std::cout << "max_list_id = " << buckets_stats.max_sparse_buckets_per_size() << std::endl;
std::cout << "bits_for_list_id = " << bits_for_list_id << std::endl;
std::cout << "num_bits_for_control = " << num_bits_for_control << std::endl;
}

bits::compact_vector::builder control_codewords_builder;
control_codewords_builder.resize(num_minimizers, num_bits_for_control);

strings_offsets_builder.build(d.m_spss.strings_offsets);
strings_builder.build(d.m_spss.strings);

Expand Down Expand Up @@ -100,11 +99,31 @@ void dictionary_builder<Kmer, Offsets>::build_sparse_and_skew_index(
std::vector<minimizer_tuple> tuples; // backed memory
tuples.reserve(num_super_kmers_in_buckets_larger_than_1);

// Second pass: collect buckets > 1 for sorting AND handle size-1 buckets
for (minimizers_tuples_iterator it(input.data(), input.data() + input.size()); //
it.has_next(); it.next()) //
{
const uint64_t bucket_id = it.minimizer();
auto bucket = it.bucket();
if (bucket.size() > 1) {
const uint64_t bucket_size = bucket.size();

if (bucket_size == 1) {
// Handle size-1 buckets: encode directly into control codewords
uint64_t prev_pos_in_seq = constants::invalid_uint64;
for (auto mt : bucket) {
if (mt.pos_in_seq != prev_pos_in_seq) {
/*
For minimizers occurring once, store a (log(N)+1)-bit
code, as follows: |offset|0|, i.e., the LSB is 0.
*/
uint64_t code = mt.pos_in_seq << 1; // first LS bit encodes status code: 0
assert(code < (uint64_t(1) << num_bits_for_control));
control_codewords_builder.set(bucket_id, code);
prev_pos_in_seq = mt.pos_in_seq;
}
}
} else {
// Collect buckets > 1 for later processing
minimizer_tuple const* begin = tuples.data() + tuples.size();
std::copy(bucket.begin_ptr(), bucket.end_ptr(), std::back_inserter(tuples));
minimizer_tuple const* end = tuples.data() + tuples.size();
Expand Down Expand Up @@ -187,7 +206,7 @@ void dictionary_builder<Kmer, Offsets>::build_sparse_and_skew_index(
if (prev_pos_in_seq == constants::invalid_uint64) { // only once
uint64_t p = (list_id << constants::min_l) | (curr_bucket_size - 2);
uint64_t code = (p << 2) | 1; // first two LS bits encode status code: 01
assert(code < (uint64_t(1) << (num_bits_per_offset + 1)));
assert(code < (uint64_t(1) << num_bits_for_control));
control_codewords_builder.set(mt.minimizer, code);
}
if (mt.pos_in_seq != prev_pos_in_seq) {
Expand All @@ -204,7 +223,7 @@ void dictionary_builder<Kmer, Offsets>::build_sparse_and_skew_index(
assert(partition_id < 8);
uint64_t p = (heavy_load_buckets_size << 3) | partition_id;
uint64_t code = (p << 2) | 3; // first two LS bits encode status code: 11
assert(code < (uint64_t(1) << (num_bits_per_offset + 1)));
assert(code < (uint64_t(1) << num_bits_for_control));
control_codewords_builder.set(mt.minimizer, code);
}
if (mt.pos_in_seq != prev_pos_in_seq) {
Expand Down
2 changes: 1 addition & 1 deletion include/builder/dictionary_builder.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ struct dictionary_builder //
build_stats.add("index_size_in_bytes", (d.num_bits() + 7) / 8);
build_stats.add("num_kmers", d.num_kmers());

build_stats.print();
if (build_config.verbose) build_stats.print();
}

build_configuration build_config;
Expand Down
6 changes: 6 additions & 0 deletions include/dictionary.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,9 @@ struct dictionary //
/* Return the string of the kmer whose id is kmer_id. */
void access(uint64_t kmer_id, char* string_kmer) const;

/* Read-only access to the internal bit vector m_spss.strings.
   Returned by const reference, so no copy is made. */
bits::bit_vector const& strings() const { return m_spss.strings; }

/* Membership queries. */
bool is_member(char const* string_kmer, bool check_reverse_complement = true) const;
bool is_member(Kmer uint_kmer, bool check_reverse_complement = true) const;
Expand Down Expand Up @@ -104,6 +107,9 @@ struct dictionary //
return m_spss.string_offsets(string_id);
}

/* Read-only access to the internal offsets structure m_spss.strings_offsets.
   Returned by const reference, so no copy is made. */
Offsets const& strings_offsets() const { return m_spss.strings_offsets; }

iterator at_string_id(const uint64_t string_id) const {
assert(string_id < num_strings());
auto [begin, end] = string_offsets(string_id);
Expand Down
6 changes: 3 additions & 3 deletions include/hash_util.hpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#pragma once

#include "external/pthash/include/pthash.hpp"
#include "external/cityhash/cityhash.cpp"
#include "external/cityhash/cityhash.hpp"
#include "constants.hpp"

namespace sshash {
Expand All @@ -10,7 +10,7 @@ struct minimizers_city_hasher_128 {
typedef pthash::hash128 hash_type;

static inline pthash::hash128 hash(uint64_t const minimizer, uint64_t seed) {
auto ret = CityMurmur(reinterpret_cast<char const*>(&minimizer), //
auto ret = cityhash::CityHash128WithSeed(reinterpret_cast<char const*>(&minimizer), //
sizeof(minimizer), {seed, ~seed});
return {ret.first, ret.second};
}
Expand Down Expand Up @@ -60,7 +60,7 @@ struct kmers_city_hasher_128 {
typedef pthash::hash128 hash_type;

static inline pthash::hash128 hash(Kmer const x, uint64_t seed) {
auto ret = CityMurmur(reinterpret_cast<char const*>(&(x.bits)), //
auto ret = cityhash::CityHash128WithSeed(reinterpret_cast<char const*>(&(x.bits)), //
sizeof(x.bits), {seed, ~seed});
return {ret.first, ret.second};
}
Expand Down
2 changes: 1 addition & 1 deletion tools/query.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ int query(int argc, char** argv) {
query_stats.add("num_invalid_kmers", report.num_invalid_kmers);
query_stats.add("num_searches", report.num_searches);
query_stats.add("num_extensions", report.num_extensions);
query_stats.add("elapsed_millisec", uint64(t.elapsed()));
query_stats.add("elapsed_millisec", uint64_t(t.elapsed()));

std::cout << "==== query report:\n";
std::cout << "num_kmers = " << report.num_kmers << std::endl;
Expand Down
Loading