Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
include LICENSE
include README.md
include CHANGELOG.md
recursive-include cppbktree/ *.cpp
recursive-include cppbktree/ *.hpp
recursive-include cppbktree/ *.pyx
10 changes: 5 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
[![PyPI version](https://badge.fury.io/py/cppbktree.svg)](https://badge.fury.io/py/cppbktree)
[![Downloads](https://pepy.tech/badge/cppbktree/month)](https://pepy.tech/project/cppbktree/month)
[![License](https://img.shields.io/badge/license-MIT-blue.svg)](http://opensource.org/licenses/MIT)
[![Build Status](https://travis-ci.org/mxmlnkn/cppbktree.svg?branch=master)](https://travis-ci.com/mxmlnkn/cppbktree)
[![C++ Code Checks](https://github.com/mxmlnkn/cppbktree/actions/workflows/test-cpp.yml/badge.svg)](https://github.com/mxmlnkn/cppbktree/actions/workflows/test-cpp.yml)

This module provides a BK-Tree class written in C++ to hopefully achieve better speed than pure-Python solutions.

Expand Down Expand Up @@ -43,7 +43,7 @@ This is done five times to get a hint for the standard deviation as is plotted w

## Comparison pybktree vs. cppbktree 0.1.0 with 32 K element chunking for 64-bit elements

![Comparison pybktree cppbktree chunked](benchmark/results/compare-scalings-pybktree-cppbktree-chunked-8K.png)
![Comparison pybktree cppbktree chunked](https://github.com/mxmlnkn/cppbktree/blob/master/benchmark/results/compare-scalings-pybktree-cppbktree-chunked-8K.png)

This is a benchmark for the specialized `BKTree64`, which operates on 64-bit values.
Doing the same with `BKTree` and byte vectors of size 8 to represent 64-bit values is roughly 4x slower
Expand All @@ -63,7 +63,7 @@ A more intricate version could simply concatenate the element vectors into one c

## Scaling Analysis

![Comparison pybktree cppbktree](benchmark/results/compare-scalings-pybktree-cppbktree.png)
![Comparison pybktree cppbktree](https://github.com/mxmlnkn/cppbktree/blob/master/benchmark/results/compare-scalings-pybktree-cppbktree.png)

In this log-log plot, it can be seen that the lookups and creations follow various sublinear power laws.
Inserting one element in a tree of depth $d$ should roughly take `O(log(d))` hamming distance evaluations.
Expand Down Expand Up @@ -113,7 +113,7 @@ Only the tree creation time speedup is quite independent of the tree size at rou
## Comparison pybktree vs. vptree


![Comparison pybktree cppbktree](benchmark/results/compare-scalings-pybktree-vptree.png)
![Comparison pybktree cppbktree](https://github.com/mxmlnkn/cppbktree/blob/master/benchmark/results/compare-scalings-pybktree-vptree.png)

At least in this benchmark with only 64-bit hashes and a hamming distance as metric and at least with this pure python implementation of a VP-Tree, the results are quite disappointing.
The vptree module is almost always slower.
Expand All @@ -123,7 +123,7 @@ For the 100k elements, this results in pybktree being 7.7 times faster than vptr

## Comparison linear lookup vs. cppbktree

![Comparison pybktree cppbktree](benchmark/results/compare-scalings-cppbktree-linear-lookup.png)
![Comparison pybktree cppbktree](https://github.com/mxmlnkn/cppbktree/blob/master/benchmark/results/compare-scalings-cppbktree-linear-lookup.png)

This log-log comparison plot shows that a simple linear lookup can compete with a BK tree.
For exact and almost exact lookups, the BK tree can become faster but even for a distance of 2,
Expand Down
10 changes: 10 additions & 0 deletions cppbktree/cppbktree.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,14 @@ hammingDistance64( const uint64_t a,
}


/**
 * Hamming-distance metric for signed 64-bit values.
 *
 * XORs the two operands, which leaves a 1 bit at every position where they
 * differ, and hands that bit pattern to countBits.
 *
 * @param a first value
 * @param b second value
 * @return whatever countBits reports for a XOR b (presumably the number of set bits).
 */
inline size_t
hammingDistance64s( const int64_t a,
                    const int64_t b )
{
    const int64_t differingBits = a ^ b;
    return countBits( differingBits );
}


template<typename T_ValueType,
typename T_DistanceType>
class CppBKTree
Expand Down Expand Up @@ -311,6 +319,8 @@ class CppBKTree
if ( !m_metricFunction ) {
if constexpr ( std::is_same_v<ValueType, uint64_t> ) {
m_metricFunction = MetricFunction( &hammingDistance64 );
} else if constexpr ( std::is_same_v<ValueType, int64_t> ) {
m_metricFunction = MetricFunction( &hammingDistance64s );
} else if constexpr ( std::is_same_v<ValueType, std::vector<uint8_t> > ) {
m_metricFunction = MetricFunction( &hammingDistance );
} else {
Expand Down
100 changes: 99 additions & 1 deletion cppbktree/cppbktree.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from libc.stdlib cimport malloc, free
from libc.stdio cimport SEEK_SET
from libc.stdint cimport uint8_t, uint64_t
from libc.stdint cimport uint8_t, uint64_t, int64_t
from libcpp.map cimport map
from libcpp.vector cimport vector
from libcpp.string cimport string
Expand Down Expand Up @@ -36,6 +36,23 @@ cdef class _LinearLookup64:
def size(self):
return self.data.size()


cdef class _LinearLookup64s:
    """
    Extension type that owns a heap-allocated CppLinearLookup[int64_t]
    and forwards find / size requests to it.
    """
    cdef CppLinearLookup[int64_t]* data

    def __cinit__(self, list_of_hashes):
        # The C++ object is created here and released again in __dealloc__.
        self.data = new CppLinearLookup[int64_t](list_of_hashes)

    def __dealloc__(self):
        # Frees the object allocated in __cinit__.
        del self.data

    def find(self, query, distance=0):
        """
        Forward the lookup to the C++ object and return its result as a Python list.
        NOTE(review): distance is presumably the maximum accepted Hamming distance -- confirm in the C++ header.
        """
        matches = <list>(self.data.find(query, distance))
        return matches

    def size(self):
        """Return the size reported by the underlying C++ object."""
        element_count = self.data.size()
        return element_count


# Extra class because cdefs are not visible from outside
class LinearLookup64:
def __init__(self, list_of_hashes):
Expand All @@ -48,9 +65,21 @@ class LinearLookup64:
return self.tree.size()


class LinearLookup64s:
    """
    Plain Python class delegating to the cdef class _LinearLookup64s,
    which is stored in the ``tree`` attribute.
    """

    def __init__(self, list_of_hashes):
        backend = _LinearLookup64s(list_of_hashes)
        self.tree = backend

    def find(self, query, distance=0):
        """Delegate the lookup to the backend and return its result unchanged."""
        matches = self.tree.find(query, distance)
        return matches

    def size(self):
        """Return the size reported by the backend."""
        element_count = self.tree.size()
        return element_count


cdef extern from "cppbktree.hpp":
size_t hammingDistance(const vector[uint8_t]&, const vector[uint8_t]& ) except +
size_t hammingDistance64( const uint64_t, const uint64_t ) except +;
size_t hammingDistance64s( const int64_t, const int64_t ) except +;

cppclass CppBKTree[T_ValueType, T_DistanceType]:
struct TreeStatistics:
Expand Down Expand Up @@ -174,6 +203,55 @@ cdef class _BKTree64:
return self.tree.rebalance(<size_t>max_element_count)


cdef class _BKTree64s:
    """
    Extension type that owns a heap-allocated CppBKTree[int64_t, size_t].

    add() only marks the tree as needing a rebalance; the rebalance itself is
    deferred to the next find() call.

    NOTE(review): an explicit rebalance() call does not clear _needs_rebalance,
    so a find() following add() + rebalance() rebalances a second time.
    """
    cdef CppBKTree[int64_t, size_t]* tree
    cdef int max_element_count
    cdef bool _needs_rebalance

    def __cinit__(self, list_of_hashes_or_file_name, max_element_count = 32 * 1024):
        self.tree = new CppBKTree[int64_t, size_t](list_of_hashes_or_file_name)
        # Rebalance once right after construction with the requested limit.
        self.tree.rebalance(max_element_count)
        # Remembered as the default limit for later rebalance() calls.
        self.max_element_count = max_element_count
        self._needs_rebalance = False

    def __dealloc__(self):
        # Frees the tree allocated in __cinit__.
        del self.tree

    def add(self, list_of_hashes_or_file_name):
        """Add the given values to the tree and schedule a rebalance for the next find()."""
        self.tree.add(<vector[int64_t]>list_of_hashes_or_file_name)
        self._needs_rebalance = True

    def find(self, query, distance=0):
        """
        Run any rebalance left pending by add(), then forward the lookup to
        the C++ tree and return its result as a Python list.
        """
        if self._needs_rebalance:
            self.rebalance()
            self._needs_rebalance = False
        matches = <list>(self.tree.find(query, distance))
        return matches

    def size(self):
        """Return the size reported by the C++ tree."""
        return self.tree.size()

    def statistics(self):
        """Return the fields of the C++ TreeStatistics struct as a Python dict."""
        # Automatic POD to dict conversion did not work for me. Maybe because of the contained types?
        cdef CppBKTree[int64_t, size_t].TreeStatistics raw = self.tree.statistics()
        return {
            'nodeCount': raw.nodeCount,
            'leafCount': raw.leafCount,
            'valueCount': raw.valueCount,
            'averageChildCountPerNode': raw.averageChildCountPerNode,
            'maxDepth': raw.maxDepth,
            'minChildrenPerNode': raw.minChildrenPerNode,
            'maxChildrenPerNode': raw.maxChildrenPerNode,
            'minPayloadsPerNode': raw.minPayloadsPerNode,
            'maxPayloadsPerNode': raw.maxPayloadsPerNode,
            'duplicateCount': raw.duplicateCount,
            'valueBitCount': raw.valueBitCount,
        }

    def rebalance(self, max_element_count = None):
        """
        Rebalance the C++ tree. Without an argument, the limit stored at
        construction time is used; an explicit limit is used for this call
        only and is not stored.
        """
        if max_element_count is None:
            return self.tree.rebalance(self.max_element_count)
        return self.tree.rebalance(<size_t>max_element_count)


# Extra class because cdefs are not visible from outside
class BKTree:
def __init__(self, list_of_hashes):
Expand Down Expand Up @@ -215,4 +293,24 @@ class BKTree64:
return self.tree.rebalance(max_element_count)


class BKTree64s:
    """
    Plain Python class delegating every call to the cdef class _BKTree64s,
    which is stored in the ``tree`` attribute.
    """

    def __init__(self, list_of_hashes, max_element_count = 32 * 1024):
        backend = _BKTree64s(list_of_hashes, max_element_count)
        self.tree = backend

    def add(self, list_of_hashes_or_file_name):
        """Pass the values on to the backend; returns nothing."""
        self.tree.add(list_of_hashes_or_file_name)

    def find(self, query, distance=0):
        """Delegate the lookup to the backend and return its result unchanged."""
        matches = self.tree.find(query, distance)
        return matches

    def size(self):
        """Return the size reported by the backend."""
        element_count = self.tree.size()
        return element_count

    def statistics(self):
        """Return the statistics produced by the backend."""
        stats = self.tree.statistics()
        return stats

    def rebalance(self, max_element_count = None):
        """Delegate to the backend's rebalance, passing max_element_count through (None included)."""
        outcome = self.tree.rebalance(max_element_count)
        return outcome


__version__ = '0.2.0'