From cc32b31fd8d79d612307afe0fee44627876ca127 Mon Sep 17 00:00:00 2001 From: whoopi24 Date: Tue, 4 Mar 2025 11:59:41 +0800 Subject: [PATCH 1/3] added check if correct number of variables is returned in apply_variables() --- graphbrain/patterns/variables.pyx | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/graphbrain/patterns/variables.pyx b/graphbrain/patterns/variables.pyx index 8a9303fb..e9d70f8c 100644 --- a/graphbrain/patterns/variables.pyx +++ b/graphbrain/patterns/variables.pyx @@ -128,4 +128,9 @@ def apply_variables(edge, variables): new_edge, found = apply_variable(new_edge, var_name, var_edge) if not found: return None + # added check if number of variables is correct + var_matches = all_variables(new_edge) + if len(variables) != len(var_matches): + return None + return new_edge From 6beedc2f269140b11b3350afa3758d593da0f343 Mon Sep 17 00:00:00 2001 From: whoopi24 Date: Wed, 12 Mar 2025 08:41:11 +0800 Subject: [PATCH 2/3] added oie_evaluation.py with functions for OIE evaluation --- graphbrain/patterns/oie_evaluation.py | 211 ++++++++++++++++++++++++++ 1 file changed, 211 insertions(+) create mode 100644 graphbrain/patterns/oie_evaluation.py diff --git a/graphbrain/patterns/oie_evaluation.py b/graphbrain/patterns/oie_evaluation.py new file mode 100644 index 00000000..be1026f0 --- /dev/null +++ b/graphbrain/patterns/oie_evaluation.py @@ -0,0 +1,211 @@ +import logging +import re + +from graphbrain import hedge +from graphbrain.utils.conjunctions import conjunctions_decomposition +from rapidfuzz import process + +from graphbrain.patterns import match_pattern + + +## functions for OIE evaluation +# function to find matching key in dictionary with string similarity +def get_closest_key(target, key_dict) -> str: + closest_match, score, _ = process.extractOne(target, key_dict.keys()) + return closest_match if score > 80 else "" + + +def edge_text(atom2word, edge): + atoms = edge.all_atoms() + # added string conversion due to matching problem + atom2word = {k.to_str(): v for k, v in atom2word.items()} + words = [ + atom2word[atom.to_str()] for atom in atoms if atom.to_str() in atom2word.keys() + ] + if len(words) == 0: + # atoms might have changed due to conjunction decomposition (especially argroles) + modified_atoms = [get_closest_key(str(atom), atom2word) for atom in atoms] + words = [atom2word[atom] for atom in modified_atoms if len(atom) > 0] + if len(words) == 0: + logging.error("matching issue (edge <> atom2word)") + logging.error(f"{edge=}") + + words.sort(key=lambda word: word[1]) + text = " ".join([word[0] for word in words]) + # remove spaces around non alpha-numeric characters + # e.g.: "passive-aggressive" instead of "passive - aggressive" + text = re.sub(" ([^a-zA-Z\\d\\s]) ", "\\g<1>", text) + return text + + +def label(edge, atom2word): + if type(edge) == list: + final_result = [] + for e in edge: + # cannot handle edges which represent the special builder atom + if e.to_str() in ["+/B.mm/.", "+/B.ma/.", "+/B.am/."]: + continue + final_result.append(edge_text(atom2word, e)) + return " ".join(final_result) + # cannot handle edges which represent the special builder atom + if edge.to_str() in ["+/B.mm/.", "+/B.ma/.", "+/B.am/."]: + return "" + return edge_text(atom2word, edge) + + +def main_conjunction(edge): + if edge.is_atom(): + return edge + if edge[0] == ":/J/.": + return edge[1] + return hedge([main_conjunction(subedge) for subedge in edge]) + + +# function to save extraction in the same format as WiRe57 data +def add_to_extractions(extractions, sent_id, arg1, rel, arg2, arg3, cnt=0): + data = {"arg1": arg1, "rel": rel, "arg2": arg2, "extractor": "shg"} + if len(arg3) > 0: + data["arg3+"] = arg3 + + # check several cases + # case 1: new sentence + if sent_id not in extractions: + if len(arg3) > 0: + data["arg3+"] = arg3 + extractions[sent_id] = [data] + logging.info(f"{arg1=}, {rel=}, {arg2=}, {arg3=}") + return cnt + 1 + # case 2: new triplet arg1, rel, arg2 for existing sentence + elif data not in extractions[sent_id]: + if len(arg3) > 0: + data["arg3+"] = arg3 + extractions[sent_id].append(data) + logging.info(f"{arg1=}, {rel=}, {arg2=}, {arg3=}") + return cnt + 1 + # case 3: new argument arg3+ for existing triplet arg1, rel, arg2 + elif len(arg3) > 0: + existing_extr = extractions[sent_id] + for idx, extr in enumerate(existing_extr): + # check if arg1, rel, arg2 are the same + if extr == data: + if "arg3+" in extr: + for arg3_item in arg3: + if arg3_item not in existing_extr[idx]["arg3+"]: + existing_extr[idx]["arg3+"] += arg3_item + logging.info(f"{arg1=}, {rel=}, {arg2=}, arg3={arg3_item}") + cnt += 1 + return cnt + else: + existing_extr[idx]["arg3+"] = arg3 + logging.info(f"{arg1=}, {rel=}, {arg2=}, {arg3=}") + return cnt + 1 + # case 4: redundant extraction + return cnt + + +def find_tuples(extractions, edge, sent_id, atom2word, patterns, max_extr): + extr_cnt = 0 + for pattern in patterns: + special = False + pattern = hedge(pattern) + atoms = pattern.atoms() + roots = {atom.root() for atom in atoms if atom.root() != "*"} + # TODO: fix issue in match_pattern() with special builder atom +/B and handle it differently + if "+" in roots: + special = True + # skip patterns with only two variables, e.g. (REL/P.{p} ARG0/C) + elif len(roots) < 3: + logging.debug("skip pattern") + continue + for match in match_pattern(edge, pattern): + logging.debug(f"{pattern=}") + logging.debug(f"{match=}") + # stop extracting triplets after certain number of extractions + if extr_cnt == max_extr: + return + elif special: + logging.debug("special builder atom pattern") + # skip missing/incomplete matches + elif not match or len(match) != len(roots): + logging.debug("skip match") + continue + + # attention: arg1 = ARG0; arg2 = ARG1 for LSOIE patterns + if "ARG0" in match: + arg1 = label(match["ARG0"], atom2word) + arg2 = label(match["ARG1"], atom2word) + + if "ARG2" in match.keys(): + arg3 = [label(match["ARG2"], atom2word)] + else: + arg3 = [] + + # the following case is rare + if "ARG3" in match.keys(): + arg3.append(label(match["ARG3"], atom2word)) + + # else for original SH patterns with arg1 = ARG1, arg2 = ARG2 + else: + arg1 = label(match["ARG1"], atom2word) + arg2 = label(match["ARG2"], atom2word) + + if "ARG3..." in match: + arg3 = [label(match["ARG3..."], atom2word)] + else: + arg3 = [] + + # connect all parts of relation extractions (REL1/REL2/REL3) if available + if "REL1" in match: + rel_parts = [] + i = 1 + while "REL{}".format(i) in match: + rel_parts.append(label(match["REL{}".format(i)], atom2word)) + i += 1 + rel = " ".join(rel_parts) + elif special: + rel = "is" + else: + rel = label(match["REL"], atom2word) + + extr_cnt = add_to_extractions( + extractions, sent_id, arg1, rel, arg2, arg3, extr_cnt + ) + + +def information_extraction( + extractions, main_edge, sent_id, atom2word, patterns, max_extr +): + if main_edge.is_atom(): + return + if main_edge.connector_type() == "B": + logging.debug(f"find_tuples() for {main_conjunction(main_edge)=}") + find_tuples( + extractions, + main_conjunction(main_edge), + sent_id, + atom2word, + patterns, + max_extr, + ) + if main_edge.type()[0] == "R": + try: + edges = conjunctions_decomposition(main_edge, concepts=True) + for edge in edges: + logging.debug(f"find_tuples() for {main_conjunction(edge)=}") + find_tuples( + extractions, + main_conjunction(edge), + sent_id, + atom2word, + patterns, + max_extr, + ) + except IndexError: + # e.g. IndexError: tuple index out of range: if edge[0].type() == 'J' and edge.mtype() != 'C': + logging.error( + f"Issue with conjunction decomposition for one of the subedges of {main_edge=}" + ) + for edge in main_edge: + information_extraction( + extractions, edge, sent_id, atom2word, patterns, max_extr + ) From 5175475593585b2af34d3830a3e95dd26e7425d4 Mon Sep 17 00:00:00 2001 From: whoopi24 Date: Wed, 19 Mar 2025 11:37:17 +0800 Subject: [PATCH 3/3] change location of new script --- {graphbrain/patterns => scripts}/oie_evaluation.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename {graphbrain/patterns => scripts}/oie_evaluation.py (100%) diff --git a/graphbrain/patterns/oie_evaluation.py b/scripts/oie_evaluation.py similarity index 100% rename from graphbrain/patterns/oie_evaluation.py rename to scripts/oie_evaluation.py