# build_adj.py (forked from xinyangATK/ProGBN)
# Builds multi-layer word adjacency matrices from GloVe cosine similarity
# and saves them as scipy CSR matrices in a pickle file.
import pickle

import numpy as np
import scipy.sparse as sp
import torchtext.vocab as vocab

# data_file = './data/R8/r8.pkl'
# with open(data_file, 'rb') as f:
#     data = pickle.load(f)

def cosine_similarity(A, B):
    # A: N*D, B: N*D -> N*N matrix of row-wise cosine similarities.
    realmin = 2.2e-10  # small epsilon to avoid division by zero
    N, D = A.shape
    inter_product = np.matmul(A, np.transpose(B))  # N*N dot products
    len_A = np.sqrt(np.sum(A * A, axis=1, keepdims=True))  # row norms of A
    len_B = np.sqrt(np.sum(B * B, axis=1, keepdims=True))  # row norms of B
    len_AB = np.matmul(len_A, np.transpose(len_B))
    cos_AB = inter_product / (len_AB + realmin)
    cos_AB[np.arange(N), np.arange(N)] = 1  # force exact self-similarity
    return cos_AB
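
# Sanity check (an illustrative sketch, not part of the original script): for
# orthogonal unit rows the off-diagonal similarity is 0 and the diagonal is
# pinned to exactly 1 by the assignment above.
#
#   >>> A = np.array([[1.0, 0.0], [0.0, 1.0]])
#   >>> cosine_similarity(A, A)
#   array([[1., 0.],
#          [0., 1.]])
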
def get_word_embedding_from_glove(cache_dir):
    # Loads (and on first use downloads into cache_dir) the 300-d glove.6B vectors.
    glove = vocab.GloVe(name='6B', dim=300, cache=cache_dir)
    return glove

def build_adj_matrix(dataset_dir, adj_save_dir, word_emb):
    m = [2, 4, 6, 8]                   # neighbours kept per layer
    adj = {}
    threshold = [0.6, 0.5, 0.35, 0.2]  # per-layer similarity floor
    with open(dataset_dir, 'rb') as f:
        data = pickle.load(f)
    voc = data['voc20000']

    # Map each vocabulary word to its GloVe row. Words missing from GloVe are
    # temporarily pointed at the '.' vector and their indices recorded.
    voc_id = []
    ignore_id = []
    for i in range(len(voc)):
        if voc[i] not in word_emb.stoi:
            voc_id.append(word_emb.stoi['.'])
            print(voc[i])
            ignore_id.append(i)
            continue
        voc_id.append(word_emb.stoi[voc[i]])
    voc_vector = word_emb.vectors[voc_id].numpy()
    voc_vector[ignore_id] = np.zeros([len(ignore_id), 300])  # zero out OOV rows
    voc_adj = cosine_similarity(voc_vector, voc_vector)

    for layer in range(len(m)):
        adj_t = np.zeros_like(voc_adj)
        c = 0  # edge counter (kept for debugging)
        for i in range(len(voc)):
            if i in ignore_id:
                adj_t[i][i] = 1  # OOV words keep only a self-loop
                continue
            # Cut-off: similarity of the (m[layer]+1)-th closest word,
            # floored at the layer threshold.
            v = np.sort(voc_adj[i])[::-1][m[layer]]
            if v < threshold[layer]:
                v = threshold[layer]
            for j in range(len(voc)):
                if voc_adj[i][j] >= v:
                    c += 1
                    adj_t[i][j] = 1
                else:
                    adj_t[i][j] = 0
        adj['layer_' + str(layer + 1)] = sp.csr_matrix(adj_t)
        print('====> Layer {} has been built.'.format(layer + 1))
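
    # Note: the nested Python loops above are O(V^2) per layer. A vectorized
    # sketch with the same cut-off rule (OOV self-loop handling omitted) is:
    #
    #   k, t = m[layer], threshold[layer]
    #   kth = np.sort(voc_adj, axis=1)[:, ::-1][:, k]   # per-row k-th largest
    #   cut = np.maximum(kth, t)[:, None]
    #   adj_t = (voc_adj >= cut).astype(voc_adj.dtype)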
    with open(adj_save_dir, 'wb') as f:
        pickle.dump(adj, f)

if __name__ == '__main__':
    cache_dir = './data/vector/glove/'
    dataset_dir = './data/20ng/20ng.pkl'
    adj_save_dir = './data/20ng/20ng_adj.pkl'

    print('===> Load word embedding from glove')
    word_emb = get_word_embedding_from_glove(cache_dir=cache_dir)
    print('===> Build adj matrix from data')
    build_adj_matrix(dataset_dir=dataset_dir, adj_save_dir=adj_save_dir, word_emb=word_emb)
    print('===> Build over!!!')
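
# Downstream usage (a minimal sketch; the consumer side is an assumption, not
# shown in this script). Each value in the saved dict is a V x V scipy CSR
# matrix, keyed 'layer_1' ... 'layer_4':
#
#   with open('./data/20ng/20ng_adj.pkl', 'rb') as f:
#       adj = pickle.load(f)
#   layer_1 = adj['layer_1']           # scipy.sparse.csr_matrix
#   print(layer_1.shape, layer_1.nnz)  # vocab size and number of edges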