-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathutils.py
More file actions
106 lines (75 loc) · 3.44 KB
/
utils.py
File metadata and controls
106 lines (75 loc) · 3.44 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import pickle
from sklearn.feature_extraction.text import CountVectorizer
def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    # NOTE: unpickling can execute arbitrary code; only load files this project produced.
    with open(path, "rb") as f:
        return pickle.load(f)


def read_data(need_dict=False):
    """
    loads the preprocessed ted and twitter corpora (en, fr, ru) from ./data/preprocessed
    :param need_dict: if True, additionally load the EN_FR and EN_RU pickles from ./data/dicts
    :return: tuple (ted_en, ted_fr, ted_ru, twitter_en, twitter_fr, twitter_ru);
             when need_dict is True the tuple is extended with (dict_en_fr, dict_en_ru)
    """
    # Order matters: callers unpack the result positionally (ted first, then twitter).
    corpora = tuple(
        _load_pickle(f"./data/preprocessed/{source}/{source}_{lang}.pkl")
        for source in ("ted", "twitter")
        for lang in ("en", "fr", "ru")
    )
    if not need_dict:
        return corpora

    dicts = tuple(
        _load_pickle(f"./data/dicts/{name}.pkl") for name in ("EN_FR", "EN_RU")
    )
    return corpora + dicts
def texts_to_tdm(texts):
    """
    converts list of texts to a sparse count matrix (one row per text, one column per term)
    :param texts: list of texts
    :return: sparse matrix and list of feature names (the term for each column)
    """
    cv = CountVectorizer(lowercase=True, ngram_range=(1, 1))
    matrix = cv.fit_transform(texts)
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
    # prefer the replacement and fall back only on old installations.
    if hasattr(cv, "get_feature_names_out"):
        # The new API returns an ndarray; convert so callers still receive a list.
        feature_names = cv.get_feature_names_out().tolist()
    else:
        feature_names = cv.get_feature_names()
    return matrix, feature_names
def clean_dict(dictionary, vocab_src, vocab_trans, lang, write=False):
    """
    keeps only the dictionary pairs whose both words occur in the vocabularies
    and converts each kept pair to a pair of vocabulary indices
    :param dictionary: iterable of pairs (source word, translated word)
    :param vocab_src: sequence of source vocabulary words
    :param vocab_trans: sequence of translation vocabulary words
    :param lang: suffix used in the output file name clean_dict_en_{lang}.pkl
    :param write: if True, pickle the index pairs to ./data/dicts and return None
    :return: (index pairs, source word -> index map, translation word -> index map)
             when write is False, otherwise None
    """
    ind_map_src = {word: i for i, word in enumerate(vocab_src)}
    ind_map_trans = {word: i for i, word in enumerate(vocab_trans)}

    # Test membership against the index maps (O(1)) instead of scanning the
    # vocabulary sequences for every pair.
    dictionary = [
        [ind_map_src[pair[0]], ind_map_trans[pair[1]]]
        for pair in dictionary
        if pair[0] in ind_map_src and pair[1] in ind_map_trans
    ]

    if write:
        with open(f"./data/dicts/clean_dict_en_{lang}.pkl", "wb") as f:
            pickle.dump(dictionary, f)
        return None
    return dictionary, ind_map_src, ind_map_trans
def _write_lines(path, items):
    """Write every item of ``items`` to the text file at ``path``, one per line."""
    # Explicit UTF-8 so French/Russian text does not depend on the platform locale.
    with open(path, "w", encoding="utf-8") as f:
        f.writelines(f"{item}\n" for item in items)


def prepare_data_for_pltm():
    """
    dumps the preprocessed ted and twitter corpora to txt files (one text per line)
    next to the source pickles in ./data/preprocessed/ted and ./data/preprocessed/twitter
    :return: None
    """
    ted_en, ted_fr, ted_ru, twitter_en, twitter_fr, twitter_ru = read_data(need_dict=False)

    # hardcoded removal of texts with non-utf chars (otherwise pltm raises an error
    # while processing french)
    # NOTE(review): the pops run sequentially, so 906 indexes the list after the
    # element at 573 has already been removed - confirm this is the intended text.
    for corpus in (ted_en, ted_fr, ted_ru):
        corpus.pop(573)
        corpus.pop(906)

    # Each text must occupy exactly one line in the output file.
    twitter_en = [text.replace("\n", " ") for text in twitter_en]
    twitter_fr = [text.replace("\n", " ") for text in twitter_fr]
    twitter_ru = [text.replace("\n", " ") for text in twitter_ru]

    _write_lines('./data/preprocessed/ted/ted_en.txt', ted_en)
    _write_lines('./data/preprocessed/ted/ted_fr.txt', ted_fr)
    _write_lines('./data/preprocessed/ted/ted_ru.txt', ted_ru)
    print("Added txt files of ted files to ./data/preprocessed/ted")

    # NOTE(review): the three languages are truncated to different sizes
    # (3098 / 3099 / 3100); kept as in the original - confirm this is intentional.
    _write_lines('./data/preprocessed/twitter/twitter_en.txt', twitter_en[:3098])
    _write_lines('./data/preprocessed/twitter/twitter_fr.txt', twitter_fr[:3099])
    _write_lines('./data/preprocessed/twitter/twitter_ru.txt', twitter_ru[:3100])
    print("Added txt files of twitter files to ./data/preprocessed/twitter")