-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathcontextualize.py
More file actions
55 lines (50 loc) · 2.44 KB
/
contextualize.py
File metadata and controls
55 lines (50 loc) · 2.44 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import sys
import re
import argparse
import engdict as eng
import readwrite as rw
import glossarygen
def contextualize(target, source, length, *words):
    """Collect keyword-in-context windows for every occurrence of ``target``.

    A word counts as an occurrence when it equals ``target`` after the
    characters ``,.][?!:`` have been stripped from both of its ends.

    Args:
        target: Token to look for.
        source: Label stored unchanged as the last element of every hit.
        length: Maximum number of neighbouring words kept on each side.
        *words: The words to scan, in order.

    Returns:
        A list with one ``[left, word, right, source]`` entry per match.
        ``left`` and ``right`` are slices of ``words`` (tuples) holding up to
        ``length`` neighbours; ``word`` is the matched word exactly as it
        appears in ``words`` (surrounding punctuation is kept).
    """
    hits = []
    for i, word in enumerate(words):
        if word.strip(",.][?!:") != target:
            continue
        # Clamp the window start at 0: a negative slice start would count
        # from the end of ``words`` instead of stopping at the beginning.
        start = max(0, i - length)
        hits.append([words[start:i], word, words[i + 1:i + length + 1], source])
    return hits
def align_contexts(*contexts):
    """Render context entries as column-aligned text lines.

    Args:
        *contexts: Entries of the form ``[left, word, right, source]`` where
            ``left`` and ``right`` are sequences of strings, ``word`` is a
            string, and ``source`` is printed last.

    Returns:
        A list with one string per entry: the space-joined left context
        right-aligned to the widest left context, the word left-aligned to
        the longest word, the space-joined right context right-aligned to the
        widest right context, then ``source``, all separated by single spaces.
    """
    rows = [(" ".join(c[0]), c[1], " ".join(c[2]), c[3]) for c in contexts]
    lef = max((len(left) for left, _, _, _ in rows), default=0)
    # The word is a single string, so its width is simply its length; joining
    # it with spaces would insert a space between every character and almost
    # double the column width.
    cen = max((len(word) for _, word, _, _ in rows), default=0)
    rig = max((len(right) for _, _, right, _ in rows), default=0)
    # NOTE(review): the right-hand context is right-aligned, as before;
    # confirm whether left alignment next to the keyword was intended.
    return [
        f"{left:>{lef}} {word:<{cen}} {right:>{rig}} {source}"
        for left, word, right, source in rows
    ]
def full_context(texts, titles, length, lemmata):
    """Build a glossary listing in which every lemma is followed by its contexts.

    Args:
        texts: Assumed to be a list of texts, each an iterable of sentence
            strings (``[[text1]...]``).
        titles: Labels indexed in parallel with ``texts``; ``titles[i]`` is
            attached to every context found in ``texts[i]``.
        length: Number of words of context requested on each side of a token.
        lemmata: Mapping assumed to be shaped like the result of
            ``glossarygen.glossify()``:
            ``lemma: [freq, pos, gloss, [(token, analysis)]]``.

    Returns:
        A flat list of lines. For each lemma in sorted order: a header line
        made of the lemma and entries 1 and 2 of its record joined by spaces,
        then the aligned context lines, then an empty string as separator.
    """
    lines = []
    for lemma in sorted(lemmata):
        entry = lemmata[lemma]
        hits = []
        # NOTE(review): if the same token form is listed more than once for a
        # lemma, its contexts are collected once per listing - confirm whether
        # glossify() can produce such repeats.
        for token in entry[-1]:
            form = token[0]
            for idx, text in enumerate(texts):
                for sentence in text:
                    # Cheap substring pre-check before splitting the sentence.
                    if form not in sentence:
                        continue
                    hits += contextualize(form, titles[idx], length, *sentence.split(" "))
        lines.append(" ".join([lemma, entry[1], entry[2]]))
        lines += align_contexts(*hits)
        lines.append("")
    return lines
if __name__ == "__main__":
    # TODO: move this to a wrapper that makes both the glossary and its
    # contextualization with one function call (with a "make context" flag
    # raised), use argparse instead of sys.argv[]
    args = glossarygen.parseargs()

    # Read every input file, remembering its base name as the context label.
    titles = []
    text = []
    for path in args.input_files:
        titles.append(path.split('/')[-1])
        # NOTE(review): presumably drops 2 leading metadata lines - confirm
        # against readwrite.burn_metadata.
        text.append(rw.burn_metadata(2, *rw.readin(path)))

    # The glossary is built from all texts concatenated into one list.
    merged = [item for single_text in text for item in single_text]

    # Argument order: fst, spellrelaxfst, fstformat, regexfile, glossfile, mega text
    fst_file = args.fst_file
    spellrelax_file = args.spellrelax_file
    fst_format = args.fst_format
    pos_regex = "".join(rw.readin(args.pos_regex))
    glossing_dict = eng.mk_glossing_dict(*rw.readin(args.gloss_file))
    glossary = glossarygen.glossify(
        fst_file,
        spellrelax_file,
        fst_format,
        pos_regex,
        glossing_dict,
        merged,
    )
    #for g in glossary: print(g, glossary[g])

    # TODO: don't hardcode output title, make it be derivative of whatever the
    # glossary is named, else a sensible default
    context_lines = full_context(text, titles, 5, glossary)
    rw.writeout("glossary_with_contexts.txt", *context_lines)