This repository was archived by the owner on Jan 3, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 6
Expand file tree
/
Copy pathdictionary.ru.py
More file actions
executable file
·101 lines (80 loc) · 4.17 KB
/
dictionary.ru.py
File metadata and controls
executable file
·101 lines (80 loc) · 4.17 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
#!/usr/bin/env python3
__author__ = 'Dmitry Ustalov'
import argparse
import csv
import random
from collections import defaultdict
import numpy as np
from gensim.models.word2vec import Word2Vec
try:
from sklearn.model_selection import train_test_split
except ImportError:
from sklearn.cross_validation import train_test_split
# Command-line interface: both options are optional and fall back to defaults.
parser = argparse.ArgumentParser(description='Russian Dictionary.')
parser.add_argument(
    '--w2v',
    nargs='?',
    default='all.norm-sz100-w10-cb0-it1-min100.w2v',
    help='Path to the word2vec model.',
)
parser.add_argument(
    '--seed',
    nargs='?',
    type=int,
    default=228,
    help='Random seed.',
)
args = vars(parser.parse_args())

# The same seed is used for the `random` module here and for the dataset
# splits performed later in the script.
RANDOM_SEED = args['seed']
random.seed(RANDOM_SEED)

# Load the binary word2vec model; undecodable bytes in the vocabulary are ignored.
w2v = Word2Vec.load_word2vec_format(args['w2v'], binary=True, unicode_errors='ignore')
# NOTE(review): presumably normalizes the vectors in place (gensim API) - confirm
# against the installed gensim version.
w2v.init_sims(replace=True)
print('Using %d word2vec dimensions from "%s".' % (w2v.layer1_size, args['w2v']))
# Word -> list of related words, filled in insertion order without duplicates.
# `hypernyms_patterns` and `hypernyms_wiktionary` map a hyponym to its
# hypernyms; `synonyms` maps a word to its synonyms.
hypernyms_patterns = defaultdict(list)
hypernyms_wiktionary = defaultdict(list)
synonyms = defaultdict(list)

# Load the pattern-extracted pairs from a tab-separated file with a header row
# providing the 'hyponym', 'hypernym' and 'freq' columns.
# The encoding is given explicitly so the result does not depend on the
# platform locale (assumes the file is UTF-8 - TODO confirm against the data
# source); newline='' is what the csv module expects from its input file.
with open('pairs-isas-aa.csv', encoding='utf-8', newline='') as f:
    reader = csv.DictReader(f, delimiter='\t', quoting=csv.QUOTE_NONE)
    for row in reader:
        hyponym, hypernym, frequency = row['hyponym'], row['hypernym'], float(row['freq'])
        # Rows whose 'freq' value is below 100 are skipped.
        if frequency < 100:
            continue
        # Keep the pair only when both words are in the word2vec vocabulary
        # and the hypernym has not been recorded for this hyponym yet.
        if hyponym in w2v and hypernym in w2v and hypernym not in hypernyms_patterns[hyponym]:
            hypernyms_patterns[hyponym].append(hypernym)
# Load Wiktionary relations from a tab-separated file: columns 1 and 2 hold
# the two words and column 3 names the relation between them.
# The encoding is given explicitly so the result does not depend on the
# platform locale (assumes the file is UTF-8 - TODO confirm against the data
# source); newline='' is what the csv module expects from its input file.
with open('all_ru_pairs_ruwikt20160210_parsed.txt', encoding='utf-8', newline='') as f:
    reader = csv.reader(f, delimiter='\t', quoting=csv.QUOTE_NONE)
    for row in reader:
        relation = row[3]
        if relation == 'hypernyms':
            # Column 2 is a hypernym of column 1.
            hyponym, hypernym = row[1], row[2]
        elif relation == 'hyponyms':
            # Column 2 is a hyponym of column 1, so the roles are swapped.
            hyponym, hypernym = row[2], row[1]
        elif relation == 'synonyms':
            # Synonymy is stored in both directions, only for word pairs
            # that are both in the word2vec vocabulary.
            if row[1] in w2v and row[2] in w2v:
                if row[2] not in synonyms[row[1]]:
                    synonyms[row[1]].append(row[2])
                if row[1] not in synonyms[row[2]]:
                    synonyms[row[2]].append(row[1])
            continue
        else:
            # Any other relation is ignored.
            continue
        # The lookup in `hypernyms_wiktionary` is intentionally evaluated
        # first: it creates a (possibly empty) entry for every hyponym seen,
        # which determines the key order of the dictionary.
        if hypernym not in hypernyms_wiktionary[hyponym] and hyponym in w2v and hypernym in w2v:
            hypernyms_wiktionary[hyponym].append(hypernym)
# Hyponyms that ended up with at least one Wiktionary hypernym.
keys_wiktionary = [word for word, found in hypernyms_wiktionary.items() if found]

# Split the key indices 60/20/20 into train/validation/test: first 60/40,
# then the 40% part in half, always with the same random seed.
wiktionary_train, wiktionary_validation_test = train_test_split(
    np.arange(len(keys_wiktionary), dtype='int32'),
    test_size=.4,
    random_state=RANDOM_SEED,
)
wiktionary_validation, wiktionary_test = train_test_split(
    wiktionary_validation_test,
    test_size=.5,
    random_state=RANDOM_SEED,
)

# The values are the very same list objects stored in `hypernyms_wiktionary`.
hypernyms_train = {
    keys_wiktionary[i]: hypernyms_wiktionary[keys_wiktionary[i]]
    for i in wiktionary_train
}

# Extend the training hyponyms (and only those) with pattern-based hypernyms.
for hyponym, hypernyms in hypernyms_patterns.items():
    if hyponym not in hypernyms_train:
        continue
    for hypernym in hypernyms:
        if hypernym not in hypernyms_train[hyponym]:
            hypernyms_train[hyponym].append(hypernym)

hypernyms_validation = {
    keys_wiktionary[i]: hypernyms_wiktionary[keys_wiktionary[i]]
    for i in wiktionary_validation
}
hypernyms_test = {
    keys_wiktionary[i]: hypernyms_wiktionary[keys_wiktionary[i]]
    for i in wiktionary_test
}

# Flatten every mapping into a list of (hyponym, hypernym) pairs.
subsumptions_train = [
    (hypo, hyper) for hypo, hypers in hypernyms_train.items() for hyper in hypers
]
subsumptions_validation = [
    (hypo, hyper) for hypo, hypers in hypernyms_validation.items() for hyper in hypers
]
subsumptions_test = [
    (hypo, hyper) for hypo, hypers in hypernyms_test.items() for hyper in hypers
]
def write_subsumptions(subsumptions, filename):
    """Write rows to a tab-separated file, one row per line.

    The file is always written as UTF-8 with '\\n' line endings, regardless
    of the platform's default encoding and newline translation.

    Args:
        subsumptions: Iterable of rows (e.g. ``(hyponym, hypernym)`` pairs).
        filename: Path of the file to create or overwrite.
    """
    # newline='' stops the text layer from translating the '\n' terminator
    # that the csv writer emits.
    with open(filename, 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f, dialect='excel-tab', lineterminator='\n')
        writer.writerows(subsumptions)
# Persist the three subsumption splits, one tab-separated pair per line.
write_subsumptions(subsumptions_train, 'subsumptions-train.txt')
write_subsumptions(subsumptions_validation, 'subsumptions-validation.txt')
write_subsumptions(subsumptions_test, 'subsumptions-test.txt')

# Persist the synonyms as "<word>\t<comma-separated synonyms>" lines.
# The encoding is explicit so the output does not depend on the platform
# locale, and newline='' stops the text layer from translating the '\n'
# terminator that the csv writer emits.
with open('synonyms.txt', 'w', encoding='utf-8', newline='') as f:
    writer = csv.writer(f, dialect='excel-tab', lineterminator='\n')
    for word, words in synonyms.items():
        writer.writerow((word, ','.join(words)))