-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathevaluation.py
More file actions
133 lines (104 loc) · 5.2 KB
/
evaluation.py
File metadata and controls
133 lines (104 loc) · 5.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
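The point of this script is to test one random word for every template used.
The script will pull the Wiktionary page of that word and check whether the
inflected forms generated by the template appear on the page.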
"""
import re
import random
import time
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
def make_per_template(paradigms):
    """Group lemma entries by the template that generated them.

    Args:
        paradigms: mapping of lemma -> iterable of entries. Each entry is a
            non-empty sequence of lexemes, and the template name is read
            from index 1 of the entry's first lexeme.

    Returns:
        A dictionary where key = template name and value = list of
        single-item dicts ``{lemma: entry}``, one per entry that uses the
        template, in iteration order of ``paradigms``.

    Raises:
        IndexError: if an entry is empty or its first lexeme has fewer than
            two elements (no template name can be read from it).
    """
    dict_per_template = {}
    for lemma, entries in paradigms.items():
        for entry in entries:
            # The template name is stored on the first lexeme of the entry.
            template = entry[0][1]
            dict_per_template.setdefault(template, []).append({lemma: entry})
    return dict_per_template
def evaluate_templates(dict_per_template):
    """Check one randomly chosen lemma per template against Wiktionary.

    For every template one of its lemma entries is selected at random, the
    English Wiktionary page of that lemma is downloaded, and the first
    element of every lexeme in the entry is searched for in the page's HTML
    (after opening ``<a href=...>`` tags have been stripped from each line).
    One summary row per template is written to ``evaluation-log.tsv`` in the
    current working directory; the file is overwritten on every call.

    The function sleeps a random 5-10 seconds before each request.

    Args:
        dict_per_template: mapping of template name -> non-empty list of
            dicts ``{lemma: lexemes}`` (the shape returned by
            ``make_per_template``). ``lexemes`` is a sequence whose items
            carry the string to look for at index 0.

    Returns:
        A dictionary with k = template name and v = a dict with the keys
        ``'total'`` (number of lexemes checked), ``'right'`` (number found
        on the page), ``'false'`` (number not found), ``'right_list'`` and
        ``'false_list'`` (positions of found / missing lexemes within the
        entry).
    """
    # Non-greedy on purpose: a greedy ".+" would run to the LAST '">' on the
    # line and delete the link text (and anything else) in between, so forms
    # that are present on the page would be reported as missing.
    anchor_open = re.compile(r'<a href=".+?">')

    # Select one random lemma entry per template; it is used to check whether
    # the template created its inflections correctly.
    temps = {
        template: random.choice(entries)
        for template, entries in dict_per_template.items()
    }

    evaluation = {}
    # One session (and one retrying adapter) is shared by all requests and is
    # closed when the block exits, instead of leaking a session per lemma.
    with open("evaluation-log.tsv", 'w', encoding='utf-8') as log, \
            requests.Session() as session:
        adapter = HTTPAdapter(max_retries=Retry(connect=3, backoff_factor=0.5))
        session.mount('http://', adapter)
        session.mount('https://', adapter)

        log.write('Template \t Word \t No. Lexemes \t No. Rights \t No. False\n')
        for template, entry in temps.items():
            total = 0
            right = 0
            false = 0
            right_list = []   # positions of lexemes found on the page
            false_list = []   # positions of lexemes missing from the page
            log.write(template + '\t')

            for lemma, lexemes in entry.items():
                url = "https://en.wiktionary.org/wiki/" + lemma
                # Pause between requests to avoid hammering the server.
                time.sleep(random.randint(5, 10))
                log.write(lemma + "\t")
                response = session.get(url)
                pulled = [anchor_open.sub('', line)
                          for line in response.text.split('\n')]

                # enumerate gives the true position of each lexeme; looking
                # it up with list.index() would return the first equal item
                # and record wrong positions for duplicated lexemes.
                for position, word_list in enumerate(lexemes):
                    total += 1
                    form = word_list[0]
                    if any(form in line for line in pulled):
                        right += 1
                        right_list.append(position)
                    else:
                        false += 1
                        false_list.append(position)

            evaluation[template] = {
                'total': total,            # total num of lexemes
                'right': right,            # num of correct lexemes
                'false': false,            # num of wrong lexemes
                'right_list': right_list,  # indices of correct lexemes
                'false_list': false_list,  # indices of wrong lexemes
            }
            log.write("\t".join(str(n) for n in (total, right, false)) + "\n")
    return evaluation
def correct_templates(templates, evaluation):
    """Keep the templates confirmed by the evaluation and trim partial ones.

    A template whose evaluation has ``'false' == 0`` is kept unchanged.
    A template with ``'false' != 0`` and ``'right' == 0`` is dropped.
    Any other evaluated template is kept, minus the positions listed in its
    ``'false_list'``. Templates without an entry in ``evaluation`` are
    dropped as well.

    The input ``templates`` is not modified: partially correct templates are
    trimmed on a shallow copy.

    Args:
        templates: mapping of template name -> mutable container that
            supports ``.copy()`` and ``del container[i]`` (presumably a list
            aligned index-by-index with the evaluated lexemes -- verify
            against the caller).
        evaluation: mapping of template name -> dict with at least the keys
            ``'right'``, ``'false'`` and ``'false_list'``, as returned by
            ``evaluate_templates``.

    Returns:
        A new dict, in the iteration order of ``templates``, containing only
        the fully correct templates and the trimmed partially correct ones.

    Raises:
        IndexError: if a position in ``'false_list'`` is out of range for
            the corresponding list-like template.
    """
    corrected_templates = {}
    for template, slots in templates.items():
        result = evaluation.get(template)
        if result is None:
            # Never evaluated: nothing confirms this template, so drop it.
            continue

        if result['false'] == 0:
            # Entirely correct: keep as is.
            corrected_templates[template] = slots
        elif result['right'] != 0:
            # Partially correct: remove the positions of incorrect lexemes.
            # Delete from the highest index down so that earlier deletions
            # do not shift the positions still to be removed; de-duplicate
            # so that a repeated index cannot delete a neighbouring element.
            trimmed = slots.copy()
            for position in sorted(set(result['false_list']), reverse=True):
                del trimmed[position]
            corrected_templates[template] = trimmed
        # else: entirely wrong -> dropped.
    return corrected_templates