Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
92 changes: 51 additions & 41 deletions editdistance/_edit_distance_osa.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,58 +4,67 @@

// Builds the dynamic-programming table for the weighted OSA (optimal string
// alignment) edit distance between `a` and `b`.
//
// dp[i][j] holds the minimal cost of turning the first i chars of `a` into the
// first j chars of `b` using delete / insert / replace and a swap of two
// adjacent chars. Strings are compared one `char` at a time.
//
// Parameters:
//   a, b            source and target strings
//   replace_weight  cost of replacing one char with a different one
//   insert_weight   cost of inserting one char of `b`
//   delete_weight   cost of deleting one char of `a`
//   swap_weight     cost of swapping two adjacent chars
//
// Returns a (a.length() + 1) x (b.length() + 1) table; the full distance is
// the bottom-right cell.
std::vector<std::vector<double>> compute_dp_table(
    const std::string& a,
    const std::string& b,
    double replace_weight,
    double insert_weight,
    double delete_weight,
    double swap_weight
) {
    // Use the string's own size type so long inputs are not narrowed to int.
    using Index = std::string::size_type;

    const Index len_a = a.length();
    const Index len_b = b.length();
    std::vector<std::vector<double>> dp(len_a + 1, std::vector<double>(len_b + 1, 0.0));

    // First column: reaching the empty target means deleting every source char.
    for (Index i = 0; i <= len_a; ++i) {
        dp[i][0] = static_cast<double>(i) * delete_weight;
    }
    // First row: starting from the empty source means inserting every target char.
    for (Index j = 0; j <= len_b; ++j) {
        dp[0][j] = static_cast<double>(j) * insert_weight;
    }

    for (Index i = 1; i <= len_a; ++i) {
        for (Index j = 1; j <= len_b; ++j) {
            if (a[i-1] == b[j-1]) {
                // Matching chars cost nothing; the other operations are not
                // evaluated for this cell.
                // NOTE(review): this shortcut presumes non-negative weights --
                // confirm callers never pass negative costs.
                dp[i][j] = dp[i-1][j-1];
                continue;
            }

            const double deletion = dp[i-1][j] + delete_weight;
            const double insertion = dp[i][j-1] + insert_weight;
            const double substitution = dp[i-1][j-1] + replace_weight;
            dp[i][j] = std::min({deletion, insertion, substitution});

            // Adjacent transposition: the last two chars of both prefixes are
            // the same pair in opposite order.
            if (i > 1 && j > 1 &&
                a[i-1] == b[j-2] && a[i-2] == b[j-1]) {
                dp[i][j] = std::min(dp[i][j],
                                    dp[i-2][j-2] + swap_weight);
            }
        }
    }

    return dp;
}


// Returns the weighted OSA edit distance between `a` and `b`.
//
// Builds the full table with compute_dp_table() and returns its bottom-right
// cell, i.e. the cost of turning all of `a` into all of `b`.
//
// Parameters:
//   a, b            source and target strings
//   replace_weight  cost of replacing one char
//   insert_weight   cost of inserting one char
//   delete_weight   cost of deleting one char
//   swap_weight     cost of swapping two adjacent chars
double cpp_compute_distance(
    const std::string& a,
    const std::string& b,
    double replace_weight,
    double insert_weight,
    double delete_weight,
    double swap_weight
) {
    const auto dp = compute_dp_table(a, b, replace_weight, insert_weight, delete_weight, swap_weight);
    return dp[a.length()][b.length()];
}

std::vector<std::vector<CppEditop>> backtrack_all_paths(
const std::string& a,
const std::string& b,
const std::map<CppEditopName, double>& cost_map,
const std::vector<std::vector<double>>& dp,
int i,
int j,
std::vector<CppEditop>& current_path
std::vector<CppEditop>& current_path,
double replace_weight,
double insert_weight,
double delete_weight,
double swap_weight
) {
if (i == 0 && j == 0) {
std::vector<CppEditop> reversed_path = current_path;
Expand All @@ -67,70 +76,71 @@ std::vector<std::vector<CppEditop>> backtrack_all_paths(
double current_cost = dp[i][j];
const double tol = 1e-6;


if (i > 0 && std::abs((dp[i-1][j] + cost_map.at(DELETE)) - current_cost) < tol) {
CppEditop op(DELETE, i-1, i-1, cost_map.at(DELETE), std::string(1, a[i-1]));
if (i > 0 && std::abs((dp[i-1][j] + delete_weight) - current_cost) < tol) {
CppEditop op(DELETE, i-1, i-1, delete_weight, std::string(1, a[i-1]));
current_path.push_back(op);
auto paths = backtrack_all_paths(a, b, cost_map, dp, i-1, j, current_path);
auto paths = backtrack_all_paths(a, b, dp, i-1, j, current_path, replace_weight, insert_weight, delete_weight, swap_weight);
all_paths.insert(all_paths.end(), paths.begin(), paths.end());
current_path.pop_back();
}

if (j > 0 && std::abs((dp[i][j-1] + cost_map.at(INSERT)) - current_cost) < tol) {
CppEditop op(INSERT, i, i, cost_map.at(INSERT), std::string(1, b[j-1]));
if (j > 0 && std::abs((dp[i][j-1] + insert_weight) - current_cost) < tol) {
CppEditop op(INSERT, i, i, insert_weight, std::string(1, b[j-1]));
current_path.push_back(op);
auto paths = backtrack_all_paths(a, b, cost_map, dp, i, j-1, current_path);
auto paths = backtrack_all_paths(a, b, dp, i, j-1, current_path, replace_weight, insert_weight, delete_weight, swap_weight);
all_paths.insert(all_paths.end(), paths.begin(), paths.end());
current_path.pop_back();
}


if (i > 0 && j > 0) {
double sub_cost = (a[i-1] == b[j-1]) ? 0.0 : cost_map.at(REPLACE);
double sub_cost = (a[i-1] == b[j-1]) ? 0.0 : replace_weight;
if (std::abs((dp[i-1][j-1] + sub_cost) - current_cost) < tol) {
std::string out_char = (sub_cost == 0.0) ? std::string(1, a[i-1]) : std::string(1, b[j-1]);
CppEditop op(REPLACE, i-1, j-1, sub_cost, out_char);
current_path.push_back(op);
auto paths = backtrack_all_paths(a, b, cost_map, dp, i-1, j-1, current_path);
auto paths = backtrack_all_paths(a, b, dp, i-1, j-1, current_path, replace_weight, insert_weight, delete_weight, swap_weight);
all_paths.insert(all_paths.end(), paths.begin(), paths.end());
current_path.pop_back();
}
}


if (i > 1 && j > 1 &&
a[i-1] == b[j-2] && a[i-2] == b[j-1] &&
std::abs((dp[i-2][j-2] + cost_map.at(SWAP)) - current_cost) < tol) {
std::abs((dp[i-2][j-2] + swap_weight) - current_cost) < tol) {
std::string swap_str = std::string(1, b[j-2]) + std::string(1, b[j-1]);
CppEditop op(SWAP, i-2, j-2, cost_map.at(SWAP), swap_str);
CppEditop op(SWAP, i-2, j-2, swap_weight, swap_str);
current_path.push_back(op);
auto paths = backtrack_all_paths(a, b, cost_map, dp, i-2, j-2, current_path);
auto paths = backtrack_all_paths(a, b, dp, i-2, j-2, current_path, replace_weight, insert_weight, delete_weight, swap_weight);
all_paths.insert(all_paths.end(), paths.begin(), paths.end());
current_path.pop_back();
}

return all_paths;
}


// Computes the edit sequences that turn `a` into `b` under the given weights.
//
// Builds the table with compute_dp_table(), then calls backtrack_all_paths()
// starting from the bottom-right cell with an empty working path, and returns
// whatever that backtracking yields.
//
// Parameters:
//   a, b            source and target strings
//   replace_weight  cost of replacing one char
//   insert_weight   cost of inserting one char
//   delete_weight   cost of deleting one char
//   swap_weight     cost of swapping two adjacent chars
std::vector<std::vector<CppEditop>> cpp_compute_all_paths(
    const std::string& a,
    const std::string& b,
    double replace_weight,
    double insert_weight,
    double delete_weight,
    double swap_weight
) {
    const auto dp = compute_dp_table(a, b, replace_weight, insert_weight, delete_weight, swap_weight);
    std::vector<CppEditop> current_path;
    // backtrack_all_paths() takes int indices; make the conversion explicit.
    return backtrack_all_paths(
        a, b, dp,
        static_cast<int>(a.length()), static_cast<int>(b.length()),
        current_path,
        replace_weight, insert_weight, delete_weight, swap_weight);
}


void cpp_print_all_paths(
const std::string& a,
const std::string& b,
const std::map<CppEditopName, double>& cost_map
const std::string& b,
double replace_weight,
double insert_weight,
double delete_weight,
double swap_weight
) {
auto paths = cpp_compute_all_paths(a, b, cost_map);
double distance = cpp_compute_distance(a, b, cost_map);
auto paths = cpp_compute_all_paths(a, b, replace_weight, insert_weight, delete_weight, swap_weight);
double distance = cpp_compute_distance(a, b, replace_weight, insert_weight, delete_weight, swap_weight);

std::cout << "OSA Distance from '" << a << "' to '" << b << "': " << distance << std::endl;
std::cout << "Number of optimal edit sequences: " << paths.size() << std::endl;
Expand Down
35 changes: 25 additions & 10 deletions editdistance/_edit_distance_osa.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,40 +29,55 @@ struct CppEditop {

// Builds the (a.length() + 1) x (b.length() + 1) cost table for the weighted
// OSA edit distance; cell [i][j] is the cost of turning the first i chars of
// `a` into the first j chars of `b`.
std::vector<std::vector<double>> compute_dp_table(
    const std::string& a,
    const std::string& b,
    double replace_weight,
    double insert_weight,
    double delete_weight,
    double swap_weight
);


// Returns the weighted OSA edit distance between `a` and `b` (the bottom-right
// cell of the table built by compute_dp_table()).
double cpp_compute_distance(
    const std::string& a,
    const std::string& b,
    double replace_weight,
    double insert_weight,
    double delete_weight,
    double swap_weight
);


// Recursive helper that walks `dp` backwards from cell (i, j) and collects the
// edit sequences whose step costs are consistent with the table.
// `current_path` is the working path shared across the recursion; each call
// pushes an op before recursing and pops it afterwards.
// `dp` is expected to be the table produced by compute_dp_table() for the same
// strings and weights.
std::vector<std::vector<CppEditop>> backtrack_all_paths(
    const std::string& a,
    const std::string& b,
    const std::vector<std::vector<double>>& dp,
    int i,
    int j,
    std::vector<CppEditop>& current_path,
    double replace_weight,
    double insert_weight,
    double delete_weight,
    double swap_weight
);


// Builds the cost table and returns the edit sequences found by
// backtrack_all_paths() starting from the bottom-right cell.
std::vector<std::vector<CppEditop>> cpp_compute_all_paths(
    const std::string& a,
    const std::string& b,
    double replace_weight,
    double insert_weight,
    double delete_weight,
    double swap_weight
);


// Writes the OSA distance and the number of optimal edit sequences for
// `a` -> `b` to standard output.
void cpp_print_all_paths(
    const std::string& a,
    const std::string& b,
    double replace_weight,
    double insert_weight,
    double delete_weight,
    double swap_weight
);


Expand Down
55 changes: 15 additions & 40 deletions editdistance/edit_distance_osa.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,15 @@ cdef extern from "_edit_distance_osa.hpp":
double cost
string output_string

# C++ entry points; the four operation costs are passed as plain doubles in the
# order replace, insert, delete, swap.
vector[vector[CppEditop]] cpp_compute_all_paths(
    const string& a, const string& b,
    double replace_weight, double insert_weight, double delete_weight, double swap_weight)
void cpp_print_all_paths(
    const string& a, const string& b,
    double replace_weight, double insert_weight, double delete_weight, double swap_weight)
double cpp_compute_distance(
    const string& a, const string& b,
    double replace_weight, double insert_weight, double delete_weight, double swap_weight)


class EditopName(Enum):
Expand Down Expand Up @@ -52,19 +58,6 @@ cdef class Editop:
return f"Editop(name={self.name}, src_idx={self.src_idx}, dst_idx={self.dst_idx}, cost={self.cost}, output_string='{self.output_string}')"


# Converts a Python dict keyed by EditopName members into a C++
# map[CppEditopName, double]. Only INSERT, DELETE, REPLACE and SWAP are looked
# up; a key missing from `cost_map` is simply left out of the returned map.
cdef map[CppEditopName, double] _convert_cost_map(dict cost_map):
cdef map[CppEditopName, double] cpp_cost_map
# Copy each known operation cost across, one membership test per key.
if EditopName.INSERT in cost_map:
cpp_cost_map[INSERT] = cost_map[EditopName.INSERT]
if EditopName.DELETE in cost_map:
cpp_cost_map[DELETE] = cost_map[EditopName.DELETE]
if EditopName.REPLACE in cost_map:
cpp_cost_map[REPLACE] = cost_map[EditopName.REPLACE]
if EditopName.SWAP in cost_map:
cpp_cost_map[SWAP] = cost_map[EditopName.SWAP]
return cpp_cost_map


def get_all_paths(
str a,
str b,
Expand All @@ -73,16 +66,10 @@ def get_all_paths(
double delete_weight=1.0,
double swap_weight=1.0
):
cdef dict cost_map = {
EditopName.REPLACE: replace_weight,
EditopName.INSERT: insert_weight,
EditopName.DELETE: delete_weight,
EditopName.SWAP: swap_weight
}
cdef string cpp_a = a.encode("utf-8")
cdef string cpp_b = b.encode("utf-8")
cdef map[CppEditopName, double] cpp_cost_map = _convert_cost_map(cost_map)
cdef vector[vector[CppEditop]] cpp_paths = cpp_compute_all_paths(cpp_a, cpp_b, cpp_cost_map)
cdef vector[vector[CppEditop]] cpp_paths = cpp_compute_all_paths(
cpp_a, cpp_b, replace_weight, insert_weight, delete_weight, swap_weight)
python_paths = []
cdef vector[CppEditop] cpp_path
cdef CppEditop cpp_op
Expand Down Expand Up @@ -120,16 +107,10 @@ def print_all_paths(
double delete_weight=1.0,
double swap_weight=1.0
):
cdef dict cost_map = {
EditopName.REPLACE: replace_weight,
EditopName.INSERT: insert_weight,
EditopName.DELETE: delete_weight,
EditopName.SWAP: swap_weight
}
cdef string cpp_a = a.encode("utf-8")
cdef string cpp_b = b.encode("utf-8")
cdef map[CppEditopName, double] cpp_cost_map = _convert_cost_map(cost_map)
cpp_print_all_paths(cpp_a, cpp_b, cpp_cost_map)
cpp_print_all_paths(
cpp_a, cpp_b, replace_weight, insert_weight, delete_weight, swap_weight)


def compute_distance(
Expand All @@ -140,13 +121,7 @@ def compute_distance(
double delete_weight=1.0,
double swap_weight=1.0
):
cdef dict cost_map = {
EditopName.REPLACE: replace_weight,
EditopName.INSERT: insert_weight,
EditopName.DELETE: delete_weight,
EditopName.SWAP: swap_weight
}
cdef string cpp_a = a.encode("utf-8")
cdef string cpp_b = b.encode("utf-8")
cdef map[CppEditopName, double] cpp_cost_map = _convert_cost_map(cost_map)
return cpp_compute_distance(cpp_a, cpp_b, cpp_cost_map)
return cpp_compute_distance(
cpp_a, cpp_b, replace_weight, insert_weight, delete_weight, swap_weight)
Loading