Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 19 additions & 7 deletions editdistance/_edit_distance_osa.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -91,19 +91,30 @@ std::vector<std::vector<CppEditop>> backtrack_all_paths(
all_paths.insert(all_paths.end(), paths.begin(), paths.end());
current_path.pop_back();
}

if (i > 0 && j > 0) {
double sub_cost = (a[i-1] == b[j-1]) ? 0.0 : replace_weight;
if (std::abs((dp[i-1][j-1] + sub_cost) - current_cost) < tol) {
std::string out_char = (sub_cost == 0.0) ? std::string(1, a[i-1]) : std::string(1, b[j-1]);
CppEditop op(REPLACE, i-1, j-1, sub_cost, out_char);

if (i > 0 && j > 0 && a[i-1] != b[j-1]) {
if (std::abs((dp[i-1][j-1] + replace_weight) - current_cost) < tol) {
std::string out_char = std::string(1, b[j-1]);
CppEditop op(REPLACE, i-1, j-1, replace_weight, out_char);
current_path.push_back(op);
auto paths = backtrack_all_paths(a, b, dp, i-1, j-1, current_path, replace_weight, insert_weight, delete_weight, swap_weight);
all_paths.insert(all_paths.end(), paths.begin(), paths.end());
current_path.pop_back();
}
}


if (i > 0 && j > 0 && a[i-1] == b[j-1]) {
double match_weight = 0.0; // We might want to make this non-zero in the future
if (std::abs((dp[i-1][j-1] + match_weight) - current_cost) < tol) {
std::string out_char = std::string(1, a[i-1]);
CppEditop op(MATCH, i-1, j-1, match_weight, out_char);
current_path.push_back(op);
auto paths = backtrack_all_paths(a, b, dp, i-1, j-1, current_path, replace_weight, insert_weight, delete_weight, swap_weight);
all_paths.insert(all_paths.end(), paths.begin(), paths.end());
current_path.pop_back();
}
}

if (i > 1 && j > 1 &&
a[i-1] == b[j-2] && a[i-2] == b[j-1] &&
std::abs((dp[i-2][j-2] + swap_weight) - current_cost) < tol) {
Expand Down Expand Up @@ -161,6 +172,7 @@ std::string editop_name_to_string(CppEditopName name) {
case DELETE: return "DELETE";
case REPLACE: return "REPLACE";
case SWAP: return "SWAP";
case MATCH: return "MATCH";
default: return "UNKNOWN";
}
}
Expand Down
3 changes: 2 additions & 1 deletion editdistance/_edit_distance_osa.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@ enum CppEditopName {
INSERT,
DELETE,
REPLACE,
SWAP
SWAP,
MATCH
};

struct CppEditop {
Expand Down
24 changes: 15 additions & 9 deletions editdistance/edit_distance_osa.pyx
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# distutils: language = c++
# distutils: sources = ./editdistance/_edit_distance_osa.cpp

from libcpp cimport bool
from libcpp.map cimport map
from libcpp.string cimport string
from libcpp.vector cimport vector
Expand All @@ -14,6 +15,7 @@ cdef extern from "_edit_distance_osa.hpp":
DELETE
REPLACE
SWAP
MATCH

cdef struct CppEditop:
CppEditopName name
Expand All @@ -38,6 +40,7 @@ class EditopName(Enum):
DELETE = 1
REPLACE = 2
SWAP = 3
MATCH = 4


cdef class Editop:
Expand All @@ -64,7 +67,8 @@ def get_all_paths(
double replace_weight=1.0,
double insert_weight=1.0,
double delete_weight=1.0,
double swap_weight=1.0
double swap_weight=1.0,
bool return_matches=False,
):
cdef string cpp_a = a.encode("utf-8")
cdef string cpp_b = b.encode("utf-8")
Expand All @@ -76,8 +80,6 @@ def get_all_paths(
for cpp_path in cpp_paths:
python_path = []
for cpp_op in cpp_path:
if cpp_op.cost == 0:
continue
if cpp_op.name == INSERT:
py_name = EditopName.INSERT
elif cpp_op.name == DELETE:
Expand All @@ -86,6 +88,11 @@ def get_all_paths(
py_name = EditopName.REPLACE
elif cpp_op.name == SWAP:
py_name = EditopName.SWAP
elif cpp_op.name == MATCH:
if return_matches:
py_name = EditopName.MATCH
else:
continue
else:
py_name = None
python_path.append(Editop(
Expand All @@ -99,12 +106,11 @@ def get_all_paths(
return python_paths

def apply_editops(src, dst, editops):
# assumes editops are sorted from left to right
# assumes match operations are included
src_idx = 0
s = ""
for op in editops:
while src_idx < op.src_idx:
s += src[src_idx]
src_idx += 1
if op.name == EditopName.INSERT:
s += dst[op.dst_idx]
elif op.name == EditopName.DELETE:
Expand All @@ -116,9 +122,9 @@ def apply_editops(src, dst, editops):
s += src[op.src_idx + 1]
s += src[op.src_idx]
src_idx += 2
while src_idx < len(src):
s += src[src_idx]
src_idx += 1
elif op.name == EditopName.MATCH:
s += src[op.src_idx]
src_idx += 1
return s


Expand Down
17 changes: 11 additions & 6 deletions examples/osa_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,7 @@
"""

try:
from editdistance.osa import (
compute_distance,
get_all_paths,
)
from editdistance.osa import compute_distance, get_all_paths

def main():
# Test case from original Python code
Expand All @@ -25,12 +22,20 @@ def main():
print(f"Distance: {distance}")

paths = get_all_paths(source, target)
paths_with_matches = get_all_paths(source, target, return_matches=True)
print(f"Number of optimal edit sequences: {len(paths)}")

print("Paths without match editops:")
for i, path in enumerate(paths, 1):
print(f"Path {i}:")
print(f" Path {i}:")
for op in path:
print(f" {op}")
print(f" {op}")
print()
print("Paths with match editops:")
for i, path in enumerate(paths_with_matches, 1):
print(f" Path {i}:")
for op in path:
print(f" {op}")
print()

if __name__ == "__main__":
Expand Down
Empty file added tests/__init__.py
Empty file.
8 changes: 2 additions & 6 deletions tests/tests_osa.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,6 @@
import unittest

from editdistance.osa import (
apply_editops,
compute_distance,
get_all_paths,
)
from editdistance.osa import apply_editops, compute_distance, get_all_paths

COMPUTE_DISTANCE_TEST_CASES = [
("single character", "a", "b", 1.0),
Expand Down Expand Up @@ -76,7 +72,7 @@ def test_get_all_paths(self):
def test_editops_transform(self):
for src, dst in EDITOPS_TRANSFORM_TEST_CASES:
with self.subTest(src=src, dst=dst):
paths = get_all_paths(src, dst)
paths = get_all_paths(src, dst, return_matches=True)
self.assertTrue(paths, f"No paths found for {src} -> {dst}")
for path in paths:
result = apply_editops(src, dst, path)
Expand Down