From f12aadc0f673330ae4317c045b963942ae06e972 Mon Sep 17 00:00:00 2001 From: Maximilian Knespel Date: Mon, 17 Nov 2025 17:45:55 +0100 Subject: [PATCH 1/4] [doc] Replace ancient Travis CI badge --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index f62d2dc..b32d74e 100755 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ [![PyPI version](https://badge.fury.io/py/cppbktree.svg)](https://badge.fury.io/py/cppbktree) [![Downloads](https://pepy.tech/badge/cppbktree/month)](https://pepy.tech/project/cppbktree/month) [![License](https://img.shields.io/badge/license-MIT-blue.svg)](http://opensource.org/licenses/MIT) -[![Build Status](https://travis-ci.org/mxmlnkn/cppbktree.svg?branch=master)](https://travis-ci.com/mxmlnkn/cppbktree) +[![C++ Code Checks](https://github.com/mxmlnkn/cppbktree/actions/workflows/test-cpp.yml/badge.svg)](https://github.com/mxmlnkn/cppbktree/actions/workflows/test-cpp.yml) This module provides a BK-Tree class written in C++ to hopefully better speed than pure-python solutions. 
From 1beff6ca5b36868be6178e701bb32bd0ffc519ce Mon Sep 17 00:00:00 2001 From: Maximilian Knespel Date: Mon, 17 Nov 2025 17:53:22 +0100 Subject: [PATCH 2/4] [build] Add missing MANIFEST.in --- MANIFEST.in | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 MANIFEST.in diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..e3f0772 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,6 @@ +include LICENSE +include README.md +include CHANGELOG.md +recursive-include cppbktree/ *.cpp +recursive-include cppbktree/ *.hpp +recursive-include cppbktree/ *.pyx From 5ccadc44f6bccedff074ef66081a4afcaa57d588 Mon Sep 17 00:00:00 2001 From: Maximilian Knespel Date: Mon, 17 Nov 2025 17:57:24 +0100 Subject: [PATCH 3/4] [doc] Make images work on PyPI --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index b32d74e..5ed1954 100755 --- a/README.md +++ b/README.md @@ -43,7 +43,7 @@ This is done five times to get a hint for the standard deviation as is plotted w ## Comparison pybktree vs. cppbktree 0.1.0 with 32 K element chunking for 64-bit elements -![Comparison pybktree cppbktree chunked](benchmark/results/compare-scalings-pybktree-cppbktree-chunked-8K.png) +![Comparison pybktree cppbktree chunked](https://raw.githubusercontent.com/mxmlnkn/cppbktree/master/benchmark/results/compare-scalings-pybktree-cppbktree-chunked-8K.png) This is a benchmark for the specialized `BKTree64`, which operates on 64-bit values. 
Doing the same with `BKTree` and byte vectors of size 8 to represent 64-bit values is roughly 4x slower @@ -63,7 +63,7 @@ A more intricate version could simply concatenate the element vectors into one c ## Scaling Analysis -![Comparison pybktree cppbktree](benchmark/results/compare-scalings-pybktree-cppbktree.png) +![Comparison pybktree cppbktree](https://raw.githubusercontent.com/mxmlnkn/cppbktree/master/benchmark/results/compare-scalings-pybktree-cppbktree.png) In this log-log plot, it can be seen that the lookups and creations follow various sublinear power laws. Inserting one element in a tree of depth $d$ should roughly take `O(log(d))` hamming distance evaluations. @@ -113,7 +113,7 @@ Only the tree creation time speedup is quite independent of the tree size at rou ## Comparison pybktree vs. vptree -![Comparison pybktree cppbktree](benchmark/results/compare-scalings-pybktree-vptree.png) +![Comparison pybktree cppbktree](https://raw.githubusercontent.com/mxmlnkn/cppbktree/master/benchmark/results/compare-scalings-pybktree-vptree.png) At least in this benchmark with only 64-bit hashes and a hamming distance as metric and at least with this pure python implementation of a VP-Tree, the results are quite disappointing. The vptree module is almost always slower. @@ -123,7 +123,7 @@ For the 100k elements, this results in pybktree being 7.7 times faster than vptr ## Comparison linear lookup vs. cppbktree -![Comparison pybktree cppbktree](benchmark/results/compare-scalings-cppbktree-linear-lookup.png) +![Comparison pybktree cppbktree](https://raw.githubusercontent.com/mxmlnkn/cppbktree/master/benchmark/results/compare-scalings-cppbktree-linear-lookup.png) This log-log comparison plot shows that a simple linear lookup can compete with a BK tree. 
For exact and almost exact lookups, the BK tree can become faster but even for a distance of 2, From 22837359c9fec3f296fbf23eaf8ee12c3f532ba0 Mon Sep 17 00:00:00 2001 From: Alec Elton Date: Mon, 10 Nov 2025 06:10:30 -0700 Subject: [PATCH 4/4] Support for signed integer values --- cppbktree/cppbktree.hpp | 10 ++++ cppbktree/cppbktree.pyx | 100 +++++++++++++++++++++++++++++++++++++++- 2 files changed, 109 insertions(+), 1 deletion(-) diff --git a/cppbktree/cppbktree.hpp b/cppbktree/cppbktree.hpp index 471637b..76a56a6 100755 --- a/cppbktree/cppbktree.hpp +++ b/cppbktree/cppbktree.hpp @@ -70,6 +70,14 @@ hammingDistance64( const uint64_t a, } +inline size_t +hammingDistance64s( const int64_t a, + const int64_t b ) +{ + return countBits( a ^ b ); +} + + template class CppBKTree @@ -311,6 +319,8 @@ class CppBKTree if ( !m_metricFunction ) { if constexpr ( std::is_same_v ) { m_metricFunction = MetricFunction( &hammingDistance64 ); + } else if constexpr ( std::is_same_v ) { + m_metricFunction = MetricFunction( &hammingDistance64s ); } else if constexpr ( std::is_same_v > ) { m_metricFunction = MetricFunction( &hammingDistance ); } else { diff --git a/cppbktree/cppbktree.pyx b/cppbktree/cppbktree.pyx index 59d2e47..8507cbe 100755 --- a/cppbktree/cppbktree.pyx +++ b/cppbktree/cppbktree.pyx @@ -2,7 +2,7 @@ from libc.stdlib cimport malloc, free from libc.stdio cimport SEEK_SET -from libc.stdint cimport uint8_t, uint64_t +from libc.stdint cimport uint8_t, uint64_t, int64_t from libcpp.map cimport map from libcpp.vector cimport vector from libcpp.string cimport string @@ -36,6 +36,23 @@ cdef class _LinearLookup64: def size(self): return self.data.size() + +cdef class _LinearLookup64s: + cdef CppLinearLookup[int64_t]* data + + def __cinit__(self, list_of_hashes): + self.data = new CppLinearLookup[int64_t](list_of_hashes) + + def __dealloc__(self): + del self.data + + def find(self, query, distance=0): + return (self.data.find(query, distance)) + + def size(self): + return 
self.data.size() + + # Extra class because cdefs are not visible from outside class LinearLookup64: def __init__(self, list_of_hashes): @@ -48,9 +65,21 @@ class LinearLookup64: return self.tree.size() +class LinearLookup64s: + def __init__(self, list_of_hashes): + self.tree = _LinearLookup64s(list_of_hashes) + + def find(self, query, distance=0): + return self.tree.find(query, distance) + + def size(self): + return self.tree.size() + + cdef extern from "cppbktree.hpp": size_t hammingDistance(const vector[uint8_t]&, const vector[uint8_t]& ) except + size_t hammingDistance64( const uint64_t, const uint64_t ) except +; + size_t hammingDistance64s( const int64_t, const int64_t ) except +; cppclass CppBKTree[T_ValueType, T_DistanceType]: struct TreeStatistics: @@ -174,6 +203,55 @@ cdef class _BKTree64: return self.tree.rebalance(max_element_count) +cdef class _BKTree64s: + cdef CppBKTree[int64_t, size_t]* tree + cdef int max_element_count + cdef bool _needs_rebalance + + def __cinit__(self, list_of_hashes_or_file_name, max_element_count = 32 * 1024): + self.tree = new CppBKTree[int64_t, size_t](list_of_hashes_or_file_name) + self.tree.rebalance(max_element_count) + self.max_element_count = max_element_count + self._needs_rebalance = False + + def __dealloc__(self): + del self.tree + + def add(self, list_of_hashes_or_file_name): + self.tree.add(list_of_hashes_or_file_name) + self._needs_rebalance = True + + def find(self, query, distance=0): + if self._needs_rebalance: + self.rebalance() + self._needs_rebalance = False + return (self.tree.find(query, distance)) + + def size(self): + return self.tree.size() + + def statistics(self): + # Automatic POD to dict conversion did not work for me. Maybe because the contained types? 
+ cdef CppBKTree[int64_t, size_t].TreeStatistics result = self.tree.statistics(); + stats = { + 'nodeCount' : result.nodeCount , + 'leafCount' : result.leafCount , + 'valueCount' : result.valueCount , + 'averageChildCountPerNode' : result.averageChildCountPerNode, + 'maxDepth' : result.maxDepth , + 'minChildrenPerNode' : result.minChildrenPerNode , + 'maxChildrenPerNode' : result.maxChildrenPerNode , + 'minPayloadsPerNode' : result.minPayloadsPerNode , + 'maxPayloadsPerNode' : result.maxPayloadsPerNode , + 'duplicateCount' : result.duplicateCount , + 'valueBitCount' : result.valueBitCount , + } + return stats + + def rebalance(self, max_element_count = None): + return self.tree.rebalance(self.max_element_count if max_element_count is None else max_element_count) + + # Extra class because cdefs are not visible from outside class BKTree: def __init__(self, list_of_hashes): @@ -215,4 +293,24 @@ class BKTree64: return self.tree.rebalance(max_element_count) +class BKTree64s: + def __init__(self, list_of_hashes, max_element_count = 32 * 1024): + self.tree = _BKTree64s(list_of_hashes, max_element_count) + + def add(self, list_of_hashes_or_file_name): + self.tree.add(list_of_hashes_or_file_name) + + def find(self, query, distance=0): + return self.tree.find(query, distance) + + def size(self): + return self.tree.size() + + def statistics(self): + return self.tree.statistics() + + def rebalance(self, max_element_count = None): + return self.tree.rebalance(max_element_count) + + __version__ = '0.2.0'