-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsyntaxparser.py
More file actions
122 lines (89 loc) · 3.68 KB
/
syntaxparser.py
File metadata and controls
122 lines (89 loc) · 3.68 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import spacy
import json
class SyntaxParser:
    """Syntactic analysis of question objects via a spaCy pipeline.

    Responsibilities:
      * merge multi-word units (custom phrases, named entities, noun chunks)
        into single tokens,
      * detect phrasing that implies multiple answers ("how many", ...),
      * locate the index of the syntactic ROOT token.
    """

    # NOTE(review): the "en" shorthand only works with spaCy 2.x models;
    # spaCy 3 requires an explicit package name such as "en_core_web_sm".
    # Confirm the deployed spaCy version before changing.
    LANGUAGE = "en"
    CUSTOM_PHRASES = "syntax/customphrases.json"
    PLURAL_PHRASES = "syntax/pluralphrases.json"

    def __init__(self):
        self.spacy = spacy.load(SyntaxParser.LANGUAGE)
        self.custom_phrases = SyntaxParser.load_custom_phrases()
        self.plural_phrases = SyntaxParser.load_plural_phrases()

    def parse(self, question):
        """Run the full pipeline on *question* (must expose .text,
        .set_syntax, .syntax, .set_multiple_answers, .set_syntax_root_index).
        """
        question.set_syntax(self.spacy(question.text))
        # Merge before the plural check so phrase lemmas line up with the
        # merged token sequence.
        self.merge_spans(question.syntax)
        if self.has_multiple_answers(question.syntax):
            question.set_multiple_answers()
        SyntaxParser.find_root(question)

    @staticmethod
    def find_root(question):
        """Store the index of the first token whose dependency label is ROOT.

        Leaves the question untouched when no ROOT token is present.
        """
        for token in question.syntax:
            if token.dep_ == "ROOT":
                question.set_syntax_root_index(token.i)
                return

    @staticmethod
    def _load_phrases(path):
        """Read the "phrases" list from a JSON file at *path*.

        Uses a context manager so the file handle is closed promptly — the
        previous json.load(open(path)) form leaked the handle.
        """
        with open(path) as phrase_file:
            return json.load(phrase_file)["phrases"]

    @staticmethod
    def load_custom_phrases():
        """Load the multi-word phrases that should be merged into one token."""
        return SyntaxParser._load_phrases(SyntaxParser.CUSTOM_PHRASES)

    @staticmethod
    def load_plural_phrases():
        """Load the phrases signalling that a question expects several answers."""
        return SyntaxParser._load_phrases(SyntaxParser.PLURAL_PHRASES)

    @staticmethod
    def _phrase_matches_at(syntax, start, words):
        """Return True when the token lemmas at syntax[start:] equal *words*.

        Shared matcher for has_multiple_answers and make_custom_spans, which
        previously duplicated this loop.
        """
        if start + len(words) > len(syntax):
            return False
        return all(syntax[start + offset].lemma_ == word
                   for offset, word in enumerate(words))

    def has_multiple_answers(self, syntax):
        """Return True if any plural phrase occurs (by lemma) in *syntax*."""
        return any(
            self._phrase_matches_at(syntax, start, phrase.split())
            for start in range(len(syntax))
            for phrase in self.plural_phrases
        )

    def merge_spans(self, syntax):
        """Collapse custom phrases, entities, and noun chunks into single
        tokens, splitting spans around possessives so genitives survive.

        NOTE(review): relies on spaCy 2.x Span.merge(); spaCy 3 replaced it
        with Doc.retokenize() — confirm before upgrading.
        """
        # Pass order: 0 = custom phrases, 1 = named entities, 2 = noun chunks.
        for span_type in range(3):
            spans = []
            if span_type == 1:
                raw_spans = list(syntax.ents)
            elif span_type == 2:
                raw_spans = list(syntax.noun_chunks)
            else:
                raw_spans = self.make_custom_spans(syntax)
            for span in raw_spans:
                changed = False
                for idx in range(len(span)):
                    token = span[idx]
                    if token.dep_ == "poss":  # Do not merge noun chunks with a genitive in it. We need that.
                        changed = True
                        spans.append(span[0: idx + 1])
                        if idx + 2 < len(span):  # exception for the case of "New York's"
                            spans.append(span[idx + 2:])
                        break
                if not changed:
                    spans.append(span)
            for span in spans:
                if len(span) == 1:
                    continue  # single tokens need no merging
                if span[0].dep_ == "det":  # exclude the first determiner, we don't want it.
                    span[1:].merge()
                elif len(span) > 2 and span[0].text == "how" and span[1].text == "many":
                    span[2:].merge()
                else:
                    span.merge()

    def make_custom_spans(self, syntax):
        """Return the spans of *syntax* whose lemmas match a custom phrase."""
        result = []
        for start in range(len(syntax)):
            for phrase in self.custom_phrases:
                words = phrase.split()
                if self._phrase_matches_at(syntax, start, words):
                    result.append(syntax[start: start + len(words)])
        return result