Merged
159 commits
b7821f6
Merge remote-tracking branch 'upstream/master'
hmusta Dec 9, 2024
c15b1c9
inline sshash::perc
hmusta Dec 9, 2024
c62d1ae
suppress uninitialized warning
hmusta Dec 9, 2024
c86c661
Grant access to underlying dictionary (#11)
hmusta Mar 17, 2025
bedff07
Grant access to minimizers (#12)
hmusta Aug 11, 2025
b042476
some comments on random lookup benchmark
jermp Sep 3, 2025
5a47677
another comment on random lookup benchmark
jermp Sep 3, 2025
fd6a813
Add Bioconda installation badge to README
jermp Sep 4, 2025
41071fe
some results on random lookup benchmark
jermp Sep 4, 2025
e894045
updated pthash; simplified hash utils
jermp Sep 8, 2025
21c2e85
updated hash utils
jermp Sep 8, 2025
1f507a7
point to benchmarks folder
jermp Sep 10, 2025
d0e39db
tripartition of offsets
jermp Sep 12, 2025
550dbd9
fix
jermp Sep 12, 2025
80c9d00
using 32-bit words for buckets.start_lists_of_size
jermp Sep 12, 2025
a0140aa
lookup for canonical indexes
jermp Sep 12, 2025
34af717
a note on presence of minimizers when lookup is resolved via the skew…
jermp Sep 13, 2025
42c3d1d
fixed constants for skew index; merge parse and skew index construction
jermp Sep 14, 2025
c49df82
new results taken on 14/09/25: slightly faster construction, faster q…
jermp Sep 14, 2025
2c2ccc0
new results.png
jermp Sep 14, 2025
7c2e9c2
new results.png
jermp Sep 14, 2025
2155fff
a note on SIMD for encoding in dictionary::lookup; optimized string_t…
jermp Sep 16, 2025
c907f6d
a note about loop-unrolling in string_to_uint_kmer
jermp Sep 16, 2025
041d6d5
removed useless line
jermp Sep 16, 2025
c716fe7
minor fix to num. partitions in skew index; better access
jermp Sep 19, 2025
1ec6110
use a bits::compact_vector for (iteration to be fixed)
jermp Sep 19, 2025
bbcc2b6
updated external/bits
jermp Sep 20, 2025
27d8b72
updated external/bits and using bits::endpoints_sequence
jermp Sep 22, 2025
6b48d47
added missing include for compilation on Linux
jermp Sep 22, 2025
dd1a7d2
added missing include for compilation on Linux
jermp Sep 22, 2025
053f012
results 22-09-25 for k=31
jermp Sep 23, 2025
a972335
a note in readme
jermp Sep 23, 2025
cfc22a2
perf lookup by list size
jermp Sep 23, 2025
3c698d7
updated results to 22/09/25
jermp Sep 25, 2025
43dd436
added endpoints.hpp
jermp Sep 26, 2025
4390b13
minor
jermp Oct 1, 2025
df7438b
using encoded offsets
jermp Oct 3, 2025
3ccbbf4
clean up
jermp Oct 4, 2025
2cc6339
Fix processor check in CMakeLists.txt
adamant-pwn Oct 4, 2025
500747a
Merge upstream/master from jermp/sshash
adamant-pwn Oct 4, 2025
2efac6c
clean up and implemented endpoints::id_to_offset
jermp Oct 4, 2025
cfccdcc
Merge pull request #81 from adamant-pwn/patch-2
jermp Oct 4, 2025
6e4d9aa
fixed CMakeLists.txt
jermp Oct 4, 2025
77f8e59
include_directories -> target_include_directories
adamant-pwn Oct 4, 2025
0d329a6
Merge pull request #14 from jermp/master
adamant-pwn Oct 4, 2025
238e817
Add canonicalize_basepair_reverse_map to Protein kmers
adamant-pwn Oct 4, 2025
d86f2eb
Fix Clang compilation and update pthash submodule
adamant-pwn Oct 4, 2025
0ea5aca
Fix Clang compilation
adamant-pwn Oct 4, 2025
d1abe66
static -> static inline
adamant-pwn Oct 4, 2025
452972f
maybe_unused
adamant-pwn Oct 4, 2025
424dea0
Check if AVX2 is enabled instead of checking for x86_64
adamant-pwn Oct 4, 2025
841661d
Update pthash
adamant-pwn Oct 4, 2025
7758aa8
Update pthash
adamant-pwn Oct 5, 2025
a0859be
revert pthash to upstream
adamant-pwn Oct 5, 2025
f66ed13
fixed endpoints and parallel correctness check
jermp Oct 5, 2025
9399b62
Support any number of threads
adamant-pwn Oct 5, 2025
4a80d20
Fix parallel_sort.hpp
adamant-pwn Oct 5, 2025
571e3d4
added bioconda badge
jermp Oct 5, 2025
d87e11f
Multithreading fixes
adamant-pwn Oct 5, 2025
5380d4d
Merge pull request #82 from ratschlab/for-upstream
jermp Oct 6, 2025
9dc43a4
Try safe-guarding offsets_builder.set with mutex
adamant-pwn Oct 6, 2025
a01f0f8
fix kmer_t processing for uint_kmer_bits > 64
adamant-pwn Oct 6, 2025
b8f589c
implemented all miscellaneous fixes by Oleksandr Kulkov
jermp Oct 6, 2025
eb4d1c4
updated external/pthash
jermp Oct 6, 2025
ac4abe6
set offsets using a single thread
jermp Oct 7, 2025
26f48a5
removed unused code
jermp Oct 7, 2025
6578ddd
Single-threaded build_sparse_index
adamant-pwn Oct 7, 2025
b47bec1
fixes per review
adamant-pwn Oct 7, 2025
96cecad
return newline after pragma once
adamant-pwn Oct 7, 2025
e12fc8d
minor
jermp Oct 7, 2025
4e86487
Merge pull request #83 from ratschlab/for-upstream
jermp Oct 7, 2025
d22d01d
back to previous scheme
jermp Oct 10, 2025
4c07bde
more
jermp Oct 11, 2025
91677e7
more (needs fixing)
jermp Oct 12, 2025
14c832f
fix
jermp Oct 12, 2025
4733dae
fix perf test iterator
jermp Oct 12, 2025
a9055e2
big refactoring
jermp Oct 15, 2025
0f31776
minor
jermp Oct 15, 2025
13360a4
optimized num. locate queries
jermp Oct 16, 2025
858e71b
optimized num. locate queries
jermp Oct 16, 2025
127ca04
minor
jermp Oct 16, 2025
091f244
minor
jermp Oct 16, 2025
64c8443
XXH128 does not work on AMD processor: rewritten hashers for minimize…
jermp Oct 18, 2025
f5215ef
added cityhash
jermp Oct 19, 2025
09244aa
parallel checks
jermp Oct 21, 2025
5bb6ef3
print cmd; build and bench scripts updated
jermp Oct 21, 2025
d5987b2
build and bench scripts updated
jermp Oct 21, 2025
46d2118
new benchmarks logs: 21/10/25
jermp Oct 22, 2025
1b373f3
cap kmers to scan in perf_test_iterator to 10^8
jermp Oct 22, 2025
813b9bc
updated scripts
jermp Oct 22, 2025
2e42570
minor
jermp Oct 22, 2025
f66ce60
fixed build script and new results (22/10/25); also, noted that encod…
jermp Oct 22, 2025
a028972
added results
jermp Oct 23, 2025
db02c17
compute min by scan is actually faster than using a min-heap
jermp Oct 24, 2025
e67257e
scripts updated
jermp Oct 25, 2025
e9a525d
simplified file_merging_iterator
jermp Oct 25, 2025
ff33ec7
optimized merging with a loser tree (faster than a min-heap because …
jermp Oct 25, 2025
7f0b05d
avoid branch in tight loop
jermp Oct 27, 2025
ac04609
wrong namespace
jermp Oct 27, 2025
0ee2aa7
minor
jermp Oct 31, 2025
33020f4
quiet build
jermp Oct 31, 2025
70ceef1
quiet build
jermp Oct 31, 2025
2ae21fc
refactoring of build steps
jermp Nov 2, 2025
007ca31
json stats and refactored dictionary_builder
jermp Nov 3, 2025
c31d22f
minor
jermp Nov 3, 2025
e275d51
prefetching experiment: a little gain
jermp Nov 3, 2025
c41bdb8
json stats for perf benchmark
jermp Nov 3, 2025
2efb5d4
prefetching helps indeed random lookup
jermp Nov 3, 2025
7530305
prefetching also for canonical lookup
jermp Nov 4, 2025
0c53a23
updated external/pthash and refactored offsets.hpp
jermp Nov 4, 2025
e644a94
step 7.1 and 7.2 timed as well
jermp Nov 4, 2025
6fb7925
minor
jermp Nov 4, 2025
7d29302
examples in the readme updated
jermp Nov 5, 2025
fe05a41
minor
jermp Nov 6, 2025
f264b9b
minor
jermp Nov 6, 2025
5a12f40
build.py
jermp Nov 6, 2025
9219105
bench.py
jermp Nov 6, 2025
3e01643
build.py
jermp Nov 6, 2025
d5fb57c
deleted old scripts
jermp Nov 6, 2025
2b751b2
fix build.py script
jermp Nov 7, 2025
bd8be44
fix build.py script
jermp Nov 7, 2025
8c8562a
fix script
jermp Nov 7, 2025
eefc24e
updated essentials; fixed script
jermp Nov 10, 2025
efc3212
fix streaming query multiline fasta
jermp Nov 10, 2025
b7f815f
more stats to json
jermp Nov 10, 2025
2e6c05b
bench results 10/11/25
jermp Nov 10, 2025
99900cb
updated results; better streaming query script
jermp Nov 11, 2025
c4218d1
different query file for SE
jermp Nov 11, 2025
0502751
results updated
jermp Nov 12, 2025
71a6a93
benchmarks subfolder refactored
jermp Nov 12, 2025
98ee7a8
print version number in main tool
jermp Nov 12, 2025
3ed53ad
a note on benchmarks
jermp Nov 12, 2025
4365223
minor
jermp Nov 15, 2025
c769e05
sbwt results for k=63
jermp Nov 16, 2025
8c6ac62
prefetching does not actually help but writing offsets to an array fi…
jermp Nov 16, 2025
6f225a2
added results for sshash-v3 to compare against
jermp Nov 28, 2025
b59a3e4
removed empty json files
jermp Dec 8, 2025
32fd510
minor name cleanup
jermp Dec 18, 2025
de450db
minor name cleanup
jermp Dec 18, 2025
ad0cac9
removed some old comments
jermp Dec 18, 2025
1fbb593
README UPDATED
jermp Dec 18, 2025
4d9786f
Merge branch 'master' into bench
jermp Dec 19, 2025
ae61108
resolved some conflicts for merging into master
jermp Dec 19, 2025
63f5927
resolved some conflicts for merging into master
jermp Dec 19, 2025
aa95d7f
removed some old comments
jermp Dec 19, 2025
05a2626
Merge pull request #84 from jermp/bench
jermp Dec 19, 2025
be07ab7
swap locate query with string checking in lookup
jermp Jan 21, 2026
bf3a72b
Merge pull request #85 from jermp/lookup-logic-alt
jermp Jan 21, 2026
f3a35a7
results updated
jermp Jan 21, 2026
c4f2d43
readme and license updated
jermp Jan 23, 2026
e65dcb2
minor
jermp Jan 23, 2026
4c086b5
removed param
Jan 26, 2026
1152708
point to Rust port in README
jermp Feb 12, 2026
bf1ee85
Fix sparse index bug and improve library integration
adamant-pwn Feb 16, 2026
7fdd93a
Refine sparse index fix: track only buckets with size > 1
adamant-pwn Feb 16, 2026
946c8c4
Merge pull request #87 from ratschlab/upstream-bug-fixes
jermp Feb 23, 2026
4cc6255
clang format
jermp Feb 23, 2026
96314d2
updated external/pthash
jermp Feb 24, 2026
d2aa799
using cmov in the update loop of tournament tree
jermp Mar 6, 2026
2 changes: 0 additions & 2 deletions .clang-format
@@ -148,5 +148,3 @@ StatementMacros:
TabWidth: 8
UseTab: Never
...


75 changes: 45 additions & 30 deletions CMakeLists.txt
@@ -8,9 +8,9 @@ endif ()

set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})

-if (UNIX AND (CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "x86_64"))
+if (UNIX AND (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64"))
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mbmi2 -mavx2")
-if (SSHASH_USE_ARCH_NATIVE)
+if (SSHASH_USE_ARCH_NATIVE AND NOT CMAKE_CROSSCOMPILING)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native")
endif()
endif()
@@ -33,7 +33,7 @@ if (UNIX)

set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ggdb")
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Wno-missing-braces -Wno-unknown-attributes -Wno-unused-function")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Werror -Wno-missing-braces -Wno-unknown-attributes -Wno-unused-function")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread")

if (SSHASH_USE_SANITIZERS)
@@ -48,58 +48,73 @@ else()
set(CONDA_BUILD FALSE)
endif()

option(SSHASH_BUILD_EXECUTABLES "Build sshash executables" ON)
MESSAGE(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
MESSAGE(STATUS "Conda build: ${CONDA_BUILD}")
MESSAGE(STATUS "Installation prefix: ${CMAKE_INSTALL_PREFIX}")
-MESSAGE(STATUS "Compiling for processor: ${CMAKE_HOST_SYSTEM_PROCESSOR}")
+MESSAGE(STATUS "Compiling for processor: ${CMAKE_SYSTEM_PROCESSOR}")
MESSAGE(STATUS "Compiling with flags:${CMAKE_CXX_FLAGS}")

include_directories(.) # all include paths relative to parent directory
include_directories(external/pthash/include)
include_directories(external/pthash/external/bits/include)
include_directories(external/pthash/external/fastmod)
include_directories(external/pthash/external/bits/external/essentials/include)
include_directories(external/pthash/external/xxHash)
include_directories(external/pthash/external/mm_file/include)
MESSAGE(STATUS "Build executables: ${SSHASH_BUILD_EXECUTABLES}")

set(Z_LIB_SOURCES
external/gz/zip_stream.cpp
)

set(CITYHASH_SOURCES
external/cityhash/cityhash.cpp
)

set(SSHASH_SOURCES
src/build.cpp
src/dictionary.cpp
src/query.cpp
src/info.cpp
src/statistics.cpp
)

set(SSHASH_INCLUDE_DIRS
external/pthash/include
external/pthash/external/bits/include
external/pthash/external/fastmod
external/pthash/external/bits/external/essentials/include
external/pthash/external/xxHash
external/pthash/external/mm_file/include
${CMAKE_CURRENT_SOURCE_DIR}
${CMAKE_CURRENT_SOURCE_DIR}/include
)

# Create a static lib
add_library(sshash_static STATIC
${Z_LIB_SOURCES}
${CITYHASH_SOURCES}
${SSHASH_SOURCES}
)

add_executable(sshash tools/sshash.cpp)
target_link_libraries(sshash
z
)
target_include_directories(sshash_static PUBLIC ${SSHASH_INCLUDE_DIRS})

# tests:
if(SSHASH_BUILD_EXECUTABLES)
add_executable(sshash tools/sshash.cpp)
target_include_directories(sshash PUBLIC ${SSHASH_INCLUDE_DIRS})
target_link_libraries(sshash
sshash_static
z
)

add_executable(test_alphabet test/test_alphabet.cpp)
target_link_libraries(test_alphabet
sshash_static
)
# tests:

add_executable(check test/check.cpp)
target_link_libraries(check
sshash_static
z
)
add_executable(test_alphabet test/test_alphabet.cpp)
target_link_libraries(test_alphabet
sshash_static
)

if (CONDA_BUILD)
install(TARGETS sshash
RUNTIME DESTINATION bin
add_executable(check test/check.cpp)
target_link_libraries(check
sshash_static
z
)

if (CONDA_BUILD)
install(TARGETS sshash
RUNTIME DESTINATION bin
)
endif()
endif()
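The refactored CMakeLists.txt above derives a bits-per-k-mer style message from build-time options; more generally, the PR's benchmark tooling reports index space as `index_size_in_bytes * 8 / num_kmers`. A minimal sketch of that conversion (illustrative values only; the field names match the JSONL records consumed by `print_csv.py` later in this PR):

```python
import json

# One line of the build JSONL log (illustrative values, not real results).
record = '{"num_kmers": 1000000, "index_size_in_bytes": 625000}'
d = json.loads(record)

# Space per k-mer, in bits: size in bytes times 8, divided by the k-mer count.
bits_per_kmer = d["index_size_in_bytes"] * 8 / d["num_kmers"]
print(f"{bits_per_kmer:.2f} bits/k-mer")  # 5.00 bits/k-mer
```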
2 changes: 1 addition & 1 deletion LICENSE
@@ -1,6 +1,6 @@
MIT License

-Copyright 2021-2025 Giulio Ermanno Pibiri, Oleksandr Kulkov, and COMBINE Lab
+Copyright 2021-2026 Giulio Ermanno Pibiri, Oleksandr Kulkov, and COMBINE Lab

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
103 changes: 48 additions & 55 deletions README.md

Large diffs are not rendered by default.

39 changes: 17 additions & 22 deletions benchmarks/README.md
@@ -1,34 +1,29 @@
-[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.7239205.svg)](https://doi.org/10.5281/zenodo.7239205)
+[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.17582116.svg)](https://doi.org/10.5281/zenodo.17582116)

Benchmarks
----------

For these benchmarks we used the whole genomes of the following organisms:
For these benchmarks we used the datasets available here
[https://zenodo.org/records/17582116](https://zenodo.org/records/17582116).

- Gadus Morhua ("Cod")
- Falco Tinnunculus ("Kestrel")
- Homo Sapiens ("Human")

for k = 31 and 63.
To run the benchmarks, from within the `build` directory, run

The datasets and queries used in these benchmarks can be downloaded
by running the script
python3 ../script/build.py <log_label> <input_datasets_dir> <output_index_dir>
python3 ../script/bench.py <log_label> <input_index_dir>
python3 ../script/streaming-query-high-hit.py <log_label> <input_index_dir> <input_queries_dir>

```
bash download-datasets.sh
```
where `<log_label>` should be replaced by a suitable basename, e.g., the current date.

To run the benchmarks, from within the `build` directory, run
These are the results obtained on 21/01/26 (see logs [here](results-21-01-26))
on a machine equipped with an AMD Ryzen Threadripper PRO 7985WX processor clocked at 5.40GHz.
The code was compiled with `gcc` 13.3.0.

```
bash ../script/build.sh [prefix]
bash ../script/bench.sh [prefix]
bash ../script/streaming-query-high-hit.sh [prefix]
bash ../script/streaming-query-low-hit.sh [prefix]
```
The indexes were built with a max RAM usage of 16 GB and 64 threads.
Queries, instead, were run with a single thread.

where `[prefix]` should be replaced by a suitable basename, e.g., the current date.
![](results-21-01-26/results.png)

These are the results obtained on 22/08/25 (see logs [here](results-22-08-25)).
The results can be exported to CSV format with

![](results-22-08-25/results.png)
python3 ../script/print_csv.py ../benchmarks/results-10-11-25/k31
python3 ../script/print_csv.py ../benchmarks/results-10-11-25/k63
16 changes: 0 additions & 16 deletions benchmarks/download-datasets.sh

This file was deleted.

175 changes: 175 additions & 0 deletions benchmarks/print_csv.py
@@ -0,0 +1,175 @@
#!/usr/bin/env python3

import sys
import json
import os
from statistics import mean, StatisticsError
import math

def format_time(microseconds):
seconds = microseconds / 1_000_000
minutes = int(seconds // 60)
seconds = int(seconds % 60)
return f"{minutes}:{seconds:02d}"

def parse_build_file(path, canonical_flag):
"""Parse build JSONL file."""
results = []
with open(path) as f:
for line in f:
line = line.strip()
if not line:
continue
try:
d = json.loads(line)
except json.JSONDecodeError:
print(f"Skipping invalid JSON line in {path}", file=sys.stderr)
continue

num_kmers = int(d["num_kmers"])
index_bytes = int(d["index_size_in_bytes"])
build_time_us = int(d["total_build_time_in_microsec"])

bits_per_kmer = (index_bytes * 8) / num_kmers
gb = index_bytes / 1e9
build_time_fmt = format_time(build_time_us)

fname = os.path.basename(d["input_filename"])
collection = fname.split(".")[0].capitalize()
k = d["k"]

results.append({
"k": k,
"Collection": collection,
"m": d["m"],
"canonical": "yes" if canonical_flag else "no",
"bits_per_kmer": f"{bits_per_kmer:.2f}",
"total_GB": f"{gb:.2f}",
"build_time": build_time_fmt
})
return results

def parse_bench_file(path, canonical_flag):
"""Parse benchmark JSONL file and average per collection."""
lookup_data = {}
with open(path) as f:
for line in f:
line = line.strip()
if not line:
continue
try:
d = json.loads(line)
except json.JSONDecodeError:
print(f"Skipping invalid JSON line in {path}", file=sys.stderr)
continue

fname = os.path.basename(d["index_filename"])
collection = fname.split(".")[0].capitalize()
m = d["m"]
k = d["k"]
canonical = "yes" if canonical_flag else "no"

key = (collection, m, canonical)
entry = lookup_data.setdefault(key, {
"k": k,
"pos": [], "neg": [], "access": [], "iter": []
})
entry["pos"].append(float(d["positive lookup (avg_nanosec_per_kmer)"]))
entry["neg"].append(float(d["negative lookup (avg_nanosec_per_kmer)"]))
entry["access"].append(float(d["access (avg_nanosec_per_kmer)"]))
entry["iter"].append(float(d["iterator (avg_nanosec_per_kmer)"]))

# average the results
for k, v in lookup_data.items():
try:
lookup_data[k] = {
"k": v["k"],
"pos": f"{mean(v['pos'])/1000:.2f}",
"neg": f"{mean(v['neg'])/1000:.2f}",
"access": f"{mean(v['access'])/1000:.2f}",
"iter": f"{mean(v['iter']):.2f}",
}
except StatisticsError:
lookup_data[k] = {"k": v["k"], "pos": "NA", "neg": "NA", "access": "NA", "iter": "NA"}
return lookup_data


def parse_streaming_file(path, canonical_flag):
"""Parse streaming queries JSON file."""
stream_data = {}
if not os.path.exists(path):
return stream_data

with open(path) as f:
for line in f:
line = line.strip()
if not line:
continue
try:
d = json.loads(line)
except json.JSONDecodeError:
print(f"Skipping invalid JSON line in {path}", file=sys.stderr)
continue

fname = os.path.basename(d["index_filename"])
collection = fname.split(".")[0].capitalize()
canonical = "yes" if canonical_flag else "no"

key = (collection, canonical)
num_kmers = int(d["num_kmers"])
num_pos = int(d["num_positive_kmers"])
num_ext = int(d["num_extensions"])
elapsed_ms = int(d["elapsed_millisec"])

ns_per_kmer = int(math.ceil(elapsed_ms * 1e6 / num_kmers))
hit_rate = (num_pos / num_kmers) * 100 if num_kmers else 0
extension_rate = (num_ext / num_pos) * 100 if num_pos else 0

stream_data[key] = {
"ns_per_kmer": f"{ns_per_kmer}",
"hit_rate": f"{hit_rate:.2f}",
"extension_rate": f"{extension_rate:.2f}"
}
return stream_data


def main():
if len(sys.argv) != 2:
print("Usage: print_csv.py input_dir", file=sys.stderr)
sys.exit(1)

input_dir = sys.argv[1]
reg_build_path = input_dir + "/regular-build.json"
canon_build_path = input_dir + "/canon-build.json"
reg_bench_path = input_dir + "/regular-bench.json"
canon_bench_path = input_dir + "/canon-bench.json"
reg_stream_path = input_dir + "/regular-streaming-queries-high-hit.json"
canon_stream_path = input_dir + "/canon-streaming-queries-high-hit.json"

reg_build = parse_build_file(reg_build_path, False)
canon_build = parse_build_file(canon_build_path, True)
reg_bench = parse_bench_file(reg_bench_path, False)
canon_bench = parse_bench_file(canon_bench_path, True)
reg_stream = parse_streaming_file(reg_stream_path, False)
canon_stream = parse_streaming_file(canon_stream_path, True)

# merge everything
all_builds = reg_build + canon_build
lookup_all = {**reg_bench, **canon_bench}
stream_all = {**reg_stream, **canon_stream}

# CSV header
print("k,Collection,m,canonical,bits_per_kmer,total_GB,build_time,positive_lookup_ns,negative_lookup_ns,access_ns,iteration_ns,ns_per_kmer,hit_rate,extension_rate")

for r in sorted(all_builds, key=lambda x: (int(x["k"]), x["Collection"], x["canonical"])):
lookup = lookup_all.get(
(r["Collection"], r["m"], r["canonical"]), # key
{"pos": "NA", "neg": "NA", "access": "NA", "iter": "NA", "k": r["k"]})
stream = stream_all.get(
(r["Collection"], r["canonical"]), # key
{"ns_per_kmer": "NA", "hit_rate": "NA", "extension_rate": "NA"})

print(f"{r['k']},{r['Collection']},{r['m']},{r['canonical']},{r['bits_per_kmer']},{r['total_GB']},{r['build_time']},{lookup['pos']},{lookup['neg']},{lookup['access']},{lookup['iter']},{stream['ns_per_kmer']},{stream['hit_rate']},{stream['extension_rate']}")

if __name__ == "__main__":
main()
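A note on the `format_time` helper above: it truncates, rather than rounds, to whole minutes and seconds, so for instance 59,999,999 microseconds prints as `0:59`, not `1:00`. A standalone copy for illustration:

```python
def format_time(microseconds):
    # Mirror of the helper in print_csv.py: truncate to whole minutes/seconds.
    seconds = microseconds / 1_000_000
    minutes = int(seconds // 60)
    seconds = int(seconds % 60)
    return f"{minutes}:{seconds:02d}"

print(format_time(90_000_000))   # 1:30
print(format_time(59_999_999))   # 0:59
```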