From 1f36f02c1484e7b86f9a80bfdf53cacec5514793 Mon Sep 17 00:00:00 2001 From: Krzysztof Czajkowski Date: Mon, 23 Jun 2025 15:14:47 +0200 Subject: [PATCH] Add speed related improvements --- editdistance/_edit_distance_osa.cpp | 92 ++++++++++++++++------------- editdistance/_edit_distance_osa.hpp | 35 +++++++---- editdistance/edit_distance_osa.pyx | 55 +++++------------ examples/benchmark.py | 61 +++++++++++++++++++ 4 files changed, 152 insertions(+), 91 deletions(-) create mode 100644 examples/benchmark.py diff --git a/editdistance/_edit_distance_osa.cpp b/editdistance/_edit_distance_osa.cpp index b24eca1..e1a6feb 100644 --- a/editdistance/_edit_distance_osa.cpp +++ b/editdistance/_edit_distance_osa.cpp @@ -4,33 +4,37 @@ std::vector> compute_dp_table( const std::string& a, - const std::string& b, - const std::map& cost_map + const std::string& b, + double replace_weight, + double insert_weight, + double delete_weight, + double swap_weight ) { int len_a = a.length(); int len_b = b.length(); std::vector> dp(len_a + 1, std::vector(len_b + 1, 0.0)); for (int i = 0; i <= len_a; ++i) { - dp[i][0] = i * cost_map.at(DELETE); + dp[i][0] = i * delete_weight; } for (int j = 0; j <= len_b; ++j) { - dp[0][j] = j * cost_map.at(INSERT); + dp[0][j] = j * insert_weight; } for (int i = 1; i <= len_a; ++i) { for (int j = 1; j <= len_b; ++j) { - double deletion = dp[i-1][j] + cost_map.at(DELETE); - double insertion = dp[i][j-1] + cost_map.at(INSERT); - double substitution_cost = (a[i-1] == b[j-1]) ? 0.0 : cost_map.at(REPLACE); - double substitution = dp[i-1][j-1] + substitution_cost; - + if (a[i-1] == b[j-1]) { + dp[i][j] = dp[i-1][j-1]; // match, no cost + continue; // skip swap and other ops, match is optimal + } + double deletion = dp[i-1][j] + delete_weight; + double insertion = dp[i][j-1] + insert_weight; + double substitution = dp[i-1][j-1] + replace_weight; dp[i][j] = std::min({deletion, insertion, substitution}); - if (i > 1 && j > 1 && a[i-1] == b[j-2] && a[i-2] == b[j-1]) { dp[i][j] = std::min(dp[i][j], - dp[i-2][j-2] + cost_map.at(SWAP)); + dp[i-2][j-2] + swap_weight); } } } @@ -38,24 +42,29 @@ std::vector> compute_dp_table( return dp; } - double cpp_compute_distance( const std::string& a, - const std::string& b, - const std::map& cost_map + const std::string& b, + double replace_weight, + double insert_weight, + double delete_weight, + double swap_weight ) { - auto dp = compute_dp_table(a, b, cost_map); + auto dp = compute_dp_table(a, b, replace_weight, insert_weight, delete_weight, swap_weight); return dp[a.length()][b.length()]; } std::vector> backtrack_all_paths( const std::string& a, const std::string& b, - const std::map& cost_map, const std::vector>& dp, int i, int j, - std::vector& current_path + std::vector& current_path, + double replace_weight, + double insert_weight, + double delete_weight, + double swap_weight ) { if (i == 0 && j == 0) { std::vector reversed_path = current_path; @@ -67,44 +76,41 @@ std::vector> backtrack_all_paths( double current_cost = dp[i][j]; const double tol = 1e-6; - - if (i > 0 && std::abs((dp[i-1][j] + cost_map.at(DELETE)) - current_cost) < tol) { - CppEditop op(DELETE, i-1, i-1, cost_map.at(DELETE), std::string(1, a[i-1])); + if (i > 0 && std::abs((dp[i-1][j] + delete_weight) - current_cost) < tol) { + CppEditop op(DELETE, i-1, i-1, delete_weight, std::string(1, a[i-1])); current_path.push_back(op); - auto paths = backtrack_all_paths(a, b, cost_map, dp, i-1, j, current_path); + auto paths = backtrack_all_paths(a, b, dp, i-1, j, current_path, replace_weight, insert_weight, delete_weight, swap_weight); all_paths.insert(all_paths.end(), paths.begin(), paths.end()); current_path.pop_back(); } - if (j > 0 && std::abs((dp[i][j-1] + cost_map.at(INSERT)) - current_cost) < tol) { - CppEditop op(INSERT, i, i, cost_map.at(INSERT), std::string(1, b[j-1])); + if (j > 0 && std::abs((dp[i][j-1] + insert_weight) - current_cost) < tol) { + CppEditop op(INSERT, i, i, insert_weight, std::string(1, b[j-1])); current_path.push_back(op); - auto paths = backtrack_all_paths(a, b, cost_map, dp, i, j-1, current_path); + auto paths = backtrack_all_paths(a, b, dp, i, j-1, current_path, replace_weight, insert_weight, delete_weight, swap_weight); all_paths.insert(all_paths.end(), paths.begin(), paths.end()); current_path.pop_back(); } - if (i > 0 && j > 0) { - double sub_cost = (a[i-1] == b[j-1]) ? 0.0 : cost_map.at(REPLACE); + double sub_cost = (a[i-1] == b[j-1]) ? 0.0 : replace_weight; if (std::abs((dp[i-1][j-1] + sub_cost) - current_cost) < tol) { std::string out_char = (sub_cost == 0.0) ? std::string(1, a[i-1]) : std::string(1, b[j-1]); CppEditop op(REPLACE, i-1, j-1, sub_cost, out_char); current_path.push_back(op); - auto paths = backtrack_all_paths(a, b, cost_map, dp, i-1, j-1, current_path); + auto paths = backtrack_all_paths(a, b, dp, i-1, j-1, current_path, replace_weight, insert_weight, delete_weight, swap_weight); all_paths.insert(all_paths.end(), paths.begin(), paths.end()); current_path.pop_back(); } } - if (i > 1 && j > 1 && a[i-1] == b[j-2] && a[i-2] == b[j-1] && - std::abs((dp[i-2][j-2] + cost_map.at(SWAP)) - current_cost) < tol) { + std::abs((dp[i-2][j-2] + swap_weight) - current_cost) < tol) { std::string swap_str = std::string(1, b[j-2]) + std::string(1, b[j-1]); - CppEditop op(SWAP, i-2, j-2, cost_map.at(SWAP), swap_str); + CppEditop op(SWAP, i-2, j-2, swap_weight, swap_str); current_path.push_back(op); - auto paths = backtrack_all_paths(a, b, cost_map, dp, i-2, j-2, current_path); + auto paths = backtrack_all_paths(a, b, dp, i-2, j-2, current_path, replace_weight, insert_weight, delete_weight, swap_weight); all_paths.insert(all_paths.end(), paths.begin(), paths.end()); current_path.pop_back(); } @@ -112,25 +118,29 @@ std::vector> backtrack_all_paths( return all_paths; } - std::vector> cpp_compute_all_paths( const std::string& a, - const std::string& b, - const std::map& cost_map + const std::string& b, + double replace_weight, + double insert_weight, + double delete_weight, + double swap_weight ) { - auto dp = compute_dp_table(a, b, cost_map); + auto dp = compute_dp_table(a, b, replace_weight, insert_weight, delete_weight, swap_weight); std::vector current_path; - return backtrack_all_paths(a, b, cost_map, dp, a.length(), b.length(), current_path); + return backtrack_all_paths(a, b, dp, a.length(), b.length(), current_path, replace_weight, insert_weight, delete_weight, swap_weight); } - void cpp_print_all_paths( const std::string& a, - const std::string& b, - const std::map& cost_map + const std::string& b, + double replace_weight, + double insert_weight, + double delete_weight, + double swap_weight ) { - auto paths = cpp_compute_all_paths(a, b, cost_map); - double distance = cpp_compute_distance(a, b, cost_map); + auto paths = cpp_compute_all_paths(a, b, replace_weight, insert_weight, delete_weight, swap_weight); + double distance = cpp_compute_distance(a, b, replace_weight, insert_weight, delete_weight, swap_weight); std::cout << "OSA Distance from '" << a << "' to '" << b << "': " << distance << std::endl; std::cout << "Number of optimal edit sequences: " << paths.size() << std::endl; diff --git a/editdistance/_edit_distance_osa.hpp b/editdistance/_edit_distance_osa.hpp index 2c82ff2..b97d131 100644 --- a/editdistance/_edit_distance_osa.hpp +++ b/editdistance/_edit_distance_osa.hpp @@ -29,40 +29,55 @@ struct CppEditop { std::vector> compute_dp_table( const std::string& a, - const std::string& b, - const std::map& cost_map + const std::string& b, + double replace_weight, + double insert_weight, + double delete_weight, + double swap_weight ); double cpp_compute_distance( const std::string& a, - const std::string& b, - const std::map& cost_map + const std::string& b, + double replace_weight, + double insert_weight, + double delete_weight, + double swap_weight ); std::vector> backtrack_all_paths( const std::string& a, const std::string& b, - const std::map& cost_map, const std::vector>& dp, int i, int j, - std::vector& current_path + std::vector& current_path, + double replace_weight, + double insert_weight, + double delete_weight, + double swap_weight ); std::vector> cpp_compute_all_paths( const std::string& a, - const std::string& b, - const std::map& cost_map + const std::string& b, + double replace_weight, + double insert_weight, + double delete_weight, + double swap_weight ); void cpp_print_all_paths( const std::string& a, - const std::string& b, - const std::map& cost_map + const std::string& b, + double replace_weight, + double insert_weight, + double delete_weight, + double swap_weight ); diff --git a/editdistance/edit_distance_osa.pyx b/editdistance/edit_distance_osa.pyx index 76c81b9..aa5dcc9 100644 --- a/editdistance/edit_distance_osa.pyx +++ b/editdistance/edit_distance_osa.pyx @@ -22,9 +22,15 @@ cdef extern from "_edit_distance_osa.hpp": double cost string output_string - vector[vector[CppEditop]] cpp_compute_all_paths(const string& a, const string& b, const map[CppEditopName, double]& cost_map) - void cpp_print_all_paths(const string& a, const string& b, const map[CppEditopName, double]& cost_map) - double cpp_compute_distance(const string& a, const string& b, const map[CppEditopName, double]& cost_map) + vector[vector[CppEditop]] cpp_compute_all_paths( + const string& a, const string& b, + double replace_weight, double insert_weight, double delete_weight, double swap_weight) + void cpp_print_all_paths( + const string& a, const string& b, + double replace_weight, double insert_weight, double delete_weight, double swap_weight) + double cpp_compute_distance( + const string& a, const string& b, + double replace_weight, double insert_weight, double delete_weight, double swap_weight) class EditopName(Enum): @@ -52,19 +58,6 @@ cdef class Editop: return f"Editop(name={self.name}, src_idx={self.src_idx}, dst_idx={self.dst_idx}, cost={self.cost}, output_string='{self.output_string}')" -cdef map[CppEditopName, double] _convert_cost_map(dict cost_map): - cdef map[CppEditopName, double] cpp_cost_map - if EditopName.INSERT in cost_map: - cpp_cost_map[INSERT] = cost_map[EditopName.INSERT] - if EditopName.DELETE in cost_map: - cpp_cost_map[DELETE] = cost_map[EditopName.DELETE] - if EditopName.REPLACE in cost_map: - cpp_cost_map[REPLACE] = cost_map[EditopName.REPLACE] - if EditopName.SWAP in cost_map: - cpp_cost_map[SWAP] = cost_map[EditopName.SWAP] - return cpp_cost_map - - def get_all_paths( str a, str b, @@ -73,16 +66,10 @@ def get_all_paths( double delete_weight=1.0, double swap_weight=1.0 ): - cdef dict cost_map = { - EditopName.REPLACE: replace_weight, - EditopName.INSERT: insert_weight, - EditopName.DELETE: delete_weight, - EditopName.SWAP: swap_weight - } cdef string cpp_a = a.encode("utf-8") cdef string cpp_b = b.encode("utf-8") - cdef map[CppEditopName, double] cpp_cost_map = _convert_cost_map(cost_map) - cdef vector[vector[CppEditop]] cpp_paths = cpp_compute_all_paths(cpp_a, cpp_b, cpp_cost_map) + cdef vector[vector[CppEditop]] cpp_paths = cpp_compute_all_paths( + cpp_a, cpp_b, replace_weight, insert_weight, delete_weight, swap_weight) python_paths = [] cdef vector[CppEditop] cpp_path cdef CppEditop cpp_op @@ -120,16 +107,10 @@ def print_all_paths( double delete_weight=1.0, double swap_weight=1.0 ): - cdef dict cost_map = { - EditopName.REPLACE: replace_weight, - EditopName.INSERT: insert_weight, - EditopName.DELETE: delete_weight, - EditopName.SWAP: swap_weight - } cdef string cpp_a = a.encode("utf-8") cdef string cpp_b = b.encode("utf-8") - cdef map[CppEditopName, double] cpp_cost_map = _convert_cost_map(cost_map) - cpp_print_all_paths(cpp_a, cpp_b, cpp_cost_map) + cpp_print_all_paths( + cpp_a, cpp_b, replace_weight, insert_weight, delete_weight, swap_weight) def compute_distance( @@ -140,13 +121,7 @@ def compute_distance( double delete_weight=1.0, double swap_weight=1.0 ): - cdef dict cost_map = { - EditopName.REPLACE: replace_weight, - EditopName.INSERT: insert_weight, - EditopName.DELETE: delete_weight, - EditopName.SWAP: swap_weight - } cdef string cpp_a = a.encode("utf-8") cdef string cpp_b = b.encode("utf-8") - cdef map[CppEditopName, double] cpp_cost_map = _convert_cost_map(cost_map) - return cpp_compute_distance(cpp_a, cpp_b, cpp_cost_map) + return cpp_compute_distance( + cpp_a, cpp_b, replace_weight, insert_weight, delete_weight, swap_weight) diff --git a/examples/benchmark.py b/examples/benchmark.py new file mode 100644 index 0000000..fa38856 --- /dev/null +++ b/examples/benchmark.py @@ -0,0 +1,61 @@ +import random +import string +import timeit + + +def random_string(length): + return "".join(random.choices(string.ascii_lowercase, k=length)) + + +def mutate_string(s, num_changes): + s = list(s) + for _ in range(num_changes): + idx = random.randrange(len(s)) + s[idx] = random.choice(string.ascii_lowercase) + return "".join(s) + + +# Generate pairs with various similarity +pairs = [ + ("kitten", "sitting"), # moderate distance + ("abcdef", "abcdef"), # identical + ("abcdef", "ghijkl"), # completely different + ("hello", "helo"), # small edit + ("", ""), # empty strings + ("a" * 20, "a" * 20), # long identical + ("a" * 20, "b" * 20), # long different +] + +# Add random pairs with controlled mutation +base = random_string(16) +pairs.append((base, mutate_string(base, 1))) # 1 change +pairs.append((base, mutate_string(base, 4))) # 4 changes +pairs.append((base, mutate_string(base, 8))) # 8 changes + + +def bench_editdistance_osa(text1, text2, number=100000): + return timeit.timeit( + stmt="editdistance.osa.compute_distance(text1, text2)", + setup=f"import editdistance.osa; text1='{text1}'; text2='{text2}'", + number=number, + ) + + +def bench_fast_damerau(text1, text2, number=100000): + return timeit.timeit( + stmt="fastDamerauLevenshtein.damerauLevenshtein(text1, text2)", + setup=f"import fastDamerauLevenshtein; text1='{text1}'; text2='{text2}'", + number=number, + ) + + +if __name__ == "__main__": + number = 10000 # Reduce for reasonable runtime + print(f"{'Pair':<30} {'OSA (s)':>10} {'FastDL (s)':>12} {'Ratio':>10}") + print("-" * 67) + for text1, text2 in pairs: + osa_time = bench_editdistance_osa(text1, text2, number) + fastdl_time = bench_fast_damerau(text1, text2, number) + ratio = osa_time / fastdl_time if fastdl_time != 0 else float("inf") + pair_desc = f"{text1[:6]}.../{text2[:6]}..." + print(f"{pair_desc:<30} {osa_time:10.4f} {fastdl_time:12.4f} {ratio:10.2f}")