Wikinflection/evaluation.py at master · lenakmeth/Wikinflection · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
The point of this script is to test one random word for every template used.
The script wi
"""

import re
import random
import time
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry


def make_per_template(paradigms):
    """ Group the paradigms by the template that generated them.

        Returns a dictionary where key = template name and value = a list of
        single-entry dictionaries {lemma: paradigm}, one for every paradigm
        that uses the template. """

    grouped = {}

    for lemma, languages in paradigms.items():
        for paradigm in languages:
            # the template name is read from the second field of the first
            # lexeme of the paradigm
            template_name = paradigm[0][1]
            grouped.setdefault(template_name, []).append({lemma: paradigm})

    return grouped


def evaluate_templates(dict_per_template):
    """ Select one random lemma from every template and check whether the
        inflectional paradigm generated by the template is in accordance with
        the one in the Wiktionary web page of the lemma.

        Parameters:
            dict_per_template: dictionary where key = template name and
                value = a non-empty list of single-entry dictionaries
                {lemma: paradigm}. A paradigm is a list of lexemes whose
                first field is the inflected form.

        Returns:
            A dictionary where key = template name and value = a dictionary
            with the keys 'total', 'right' and 'false' (numbers of lexemes)
            and 'right_list' and 'false_list' (positions of the lexemes
            inside the paradigm).

        Side effects:
            Overwrites "evaluation-log.tsv" with one row per template and
            sleeps 5 to 10 seconds before every HTTP request. """

    # Pick one random {lemma: paradigm} entry per template. It is used to
    # check if the template created its inflections correctly.
    temps = {
        template: random.choice(entries)
        for template, entries in dict_per_template.items()
    }

    # NOTE(review): '.+' is greedy, so on a line with several links everything
    # between the first '<a href="' and the last '">' is removed, link texts
    # included. Kept as in the original; confirm this is intended.
    anchor_open = re.compile('<a href=".+">')

    evaluation = {}

    # One session (and its retry policy) is shared by all requests and is
    # closed together with the log file when the evaluation is over.
    with requests.Session() as session, \
            open("evaluation-log.tsv", 'w', encoding='utf-8') as log:
        adapter = HTTPAdapter(max_retries=Retry(connect=3, backoff_factor=0.5))
        session.mount('http://', adapter)
        session.mount('https://', adapter)

        log.write('Template \t Word \t No. Lexemes \t No. Rights \t No. False\n')

        for template, entry in temps.items():
            right_list = []  # positions of the lexemes found in the page
            false_list = []  # positions of the lexemes missing from the page
            log.write(template + '\t')

            for lemma, lexemes in entry.items():
                url = "https://en.wiktionary.org/wiki/" + lemma

                # wait a little before every request
                time.sleep(random.randint(5, 10))

                log.write(lemma + "\t")

                response = session.get(url)
                # drop the opening anchor tags so that a form occurring only
                # inside a link target does not count as found
                pulled = [anchor_open.sub('', line)
                          for line in response.text.split('\n')]

                # enumerate() gives the real position of every lexeme, even
                # when the same lexeme occurs more than once in the paradigm
                for position, lexeme in enumerate(lexemes):
                    if any(lexeme[0] in line for line in pulled):
                        right_list.append(position)
                    else:
                        false_list.append(position)

            right = len(right_list)
            false = len(false_list)
            total = right + false

            evaluation[template] = {
                'total': total,            # total num of lexemes
                'right': right,            # num of correct lexemes
                'false': false,            # num of wrong lexemes
                'right_list': right_list,  # list of indices of correct lexemes
                'false_list': false_list,  # list of indices of wrong lexemes
            }

            log.write(str(total) + "\t" + str(right) + "\t" + str(false) + "\n")

    return evaluation


def correct_templates(templates, evaluation):
    """ Once the templates are evaluated with a random lemma, keep the
        templates which are entirely correct, drop the ones which are
        entirely false, and remove the incorrect lexemes from the partially
        correct ones.

        Parameters:
            templates: dictionary where key = template name and value = the
                list of lexemes of the template (assumed to be a list, since
                the lexemes are removed by position).
            evaluation: dictionary where key = template name and value = a
                dictionary with at least the keys 'right', 'false' and
                'false_list'.

        Returns:
            A new dictionary with the corrected templates, in the order of
            `templates`. Templates without an evaluation are left out. The
            lists inside `templates` are not modified.

        Raises:
            IndexError: if a position in 'false_list' does not exist in the
                corresponding template. """

    right_templates = set()  # templates which are entirely correct
    partial_templates = {}   # templates which are partially correct

    for template, result in evaluation.items():
        if result['false'] == 0:
            right_templates.add(template)
        elif result['right'] != 0:
            # Delete from the end so that the remaining positions stay valid;
            # every position is kept once, because deleting the same position
            # twice would remove a correct lexeme.
            partial_templates[template] = sorted(set(result['false_list']),
                                                 reverse=True)
        # templates without any correct lexeme are entirely false: dropped

    corrected_templates = {}

    for template, lexemes in templates.items():
        if template in right_templates:
            corrected_templates[template] = lexemes
        elif template in partial_templates:
            # remove the indices which correspond to incorrect lexemes, on a
            # copy so that the caller's template is left untouched
            kept = list(lexemes)
            for i in partial_templates[template]:
                del kept[i]
            corrected_templates[template] = kept

    return corrected_templates