-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathkeygraph.py
More file actions
197 lines (195 loc) · 5.95 KB
/
keygraph.py
File metadata and controls
197 lines (195 loc) · 5.95 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
import os
import subprocess
import time

import matplotlib.pyplot as plt
import networkx as nx
# Size limits for the extraction.
M_1 = 10  # upper bound on the number of high-frequency words kept in HighFreq
M_2 = 4  # upper bound on the number of high-key words and of final keywords

fname = input('File name?')
if not fname:
    # Empty answer: fall back to the default corpus file name.
    fname = '28k.txt'

# sentences: every non-empty fragment obtained by splitting each line of the
# corpus on the Japanese full stop '。', in file order.
# NOTE(review): open() uses the platform's default text encoding; pass
# encoding=... explicitly if the corpus encoding is known.
sentences = []
with open(fname) as fhand:
    # The timer starts once the file is open, so the time spent waiting on
    # input() is not counted.
    time1 = time.time()
    for line in fhand:
        sentences.extend(
            part for part in line.strip('\n').split('。') if part != ''
        )
# word_set maps (word, sentence_num) -> number of times the word occurs in
# that sentence. Only output lines carrying the '普通名詞' field are counted,
# and the word is the text after the first ':' of a '"代表表記:' field.
word_set = {}
for sentence_num, sentence in enumerate(sentences):
    # The sentence is passed on stdin rather than spliced into an
    # `echo ... | juman | knp` command line, so quotes, '$', ';', backticks
    # and similar characters in the corpus can neither break the command nor
    # be executed by a shell. subprocess.run raises FileNotFoundError if the
    # juman or knp executable is missing.
    juman = subprocess.run(
        ['juman'],
        input=sentence + '\n',
        stdout=subprocess.PIPE,
        universal_newlines=True,
    )
    knp = subprocess.run(
        ['knp', '-simple'],
        input=juman.stdout,
        stdout=subprocess.PIPE,
        universal_newlines=True,
    )
    for details in knp.stdout.splitlines():
        # Lines starting with '*', '#' or '+' are skipped.
        if details.startswith(('*', '#', '+')):
            continue
        knpresult = details.split()
        if '普通名詞' not in knpresult:
            continue
        for keys in knpresult:
            if not keys.startswith('"代表表記:'):
                continue
            # NOTE(review): if this field is the last one inside the quoted
            # feature string, the value may keep a trailing '"' - confirm
            # against real KNP output.
            word = keys.split(':')[1]
            # Representative forms ending in 'v' are excluded.
            if word.endswith('v'):
                continue
            entry = (word, sentence_num)
            word_set[entry] = word_set.get(entry, 0) + 1
time2 = time.time()
# words maps each distinct word to its total count over all sentences.
words = {}
for (term, _sentence_id), count in word_set.items():
    words[term] = words.get(term, 0) + count

# Keep at most M_1 words, ranked by (count, word) in descending order.
m_1 = min(M_1, len(words))
by_frequency = sorted(
    ((count, term) for term, count in words.items()),
    reverse=True,
)
HighFreq = [term for _count, term in by_frequency[:m_1]]
# Group the (word, count) entries by sentence first, so that only words that
# actually share a sentence are paired. This avoids comparing every entry of
# the corpus with every other entry. Entry order inside a sentence, and the
# order of sentences, follow word_set's insertion order.
by_sentence = {}
for (word, sentence_id), times in word_set.items():
    by_sentence.setdefault(sentence_id, []).append((word, times))

# co_occurrence maps the ordered pair (word1, word2) of distinct words to the
# sum, over the sentences containing both, of count1 * count2. Both
# orientations of every pair are stored.
co_occurrence = {}
for entries in by_sentence.values():
    for word1, times1 in entries:
        for word2, times2 in entries:
            if word1 == word2:
                continue
            pair = (word1, word2)
            co_occurrence[pair] = co_occurrence.get(pair, 0) + times1 * times2

# co_tem keeps a single orientation per unordered pair: whichever one appears
# first when iterating co_occurrence.
co_tem = {}
for (word1, word2), co in co_occurrence.items():
    if (word2, word1) in co_tem:
        continue
    co_tem[(word1, word2)] = co
# co_set maps (word1, word2) -> co-occurrence value, restricted to pairs in
# which both words belong to HighFreq.
co_set = {}
for first in HighFreq:
    for second in HighFreq:
        pair = (first, second)
        if pair in co_tem:
            co_set[pair] = co_tem[pair]

# links: the m_1 - 1 pairs with the largest (co, pair) value, strongest first.
strongest = sorted(
    ((co, pair) for pair, co in co_set.items()),
    reverse=True,
)
links = [pair for _co, pair in strongest[:m_1 - 1]]
# foundations is an adjacency mapping word -> list of linked words. Every
# HighFreq word is a key, including words that end up with no link.
foundations = {word: [] for word in HighFreq}
for word1, word2 in links:
    foundations[word1].append(word2)
    foundations[word2].append(word1)

G = nx.Graph(foundations)
# graphs: one subgraph per connected component of G.
# nx.connected_component_subgraphs() was removed in NetworkX 2.4; building the
# subgraphs from nx.connected_components() is the documented replacement.
graphs = [G.subgraph(nodes) for nodes in nx.connected_components(G)]
# g_s_set maps (graph_num, sentence_num) -> total count, within that
# sentence, of the words that are nodes of that component.
g_s_set = {}
for (term, sentence_id), count in word_set.items():
    for graph_num, component in enumerate(graphs):
        if term in component.nodes():
            slot = (graph_num, sentence_id)
            g_s_set[slot] = g_s_set.get(slot, 0) + count
# based_set maps (word, graph_num) -> sum over the word's sentences of
#   count * rest
# neighbors_set maps graph_num -> sum over every (word, sentence) entry of
#   rest
# where `rest` is the component's count in the sentence minus the word's own
# count when the word is a node of the component.
# NOTE(review): for a word outside the component, `rest` is the word's own
# count rather than the component's count in that sentence - confirm this is
# the intended definition.
based_set = {}
neighbors_set = {}
for graph_num, component in enumerate(graphs):
    for (term, sentence_id), count in word_set.items():
        if term in component.nodes():
            rest = g_s_set[(graph_num, sentence_id)] - count
        else:
            rest = count
        based_key = (term, graph_num)
        based_set[based_key] = based_set.get(based_key, 0) + count * rest
        neighbors_set[graph_num] = neighbors_set.get(graph_num, 0) + rest
# key_set maps word -> 1 - product over all components g of
#   (1 - based(word, g) / neighbors(g)).
# The product runs over every component and is taken exactly once per
# distinct word, so it does not depend on how many sentences the word
# occurs in.
# NOTE(review): this is believed to match the key(w) definition of the
# KeyGraph paper - confirm against the reference.
key_set = {}
for candidate in words:
    miss = 1
    for graph_num in range(len(graphs)):
        neighbors = neighbors_set[graph_num]
        if neighbors == 0:
            # Every term summed into neighbors is non-negative, so a zero
            # total means based is zero too: the factor is 1 and dividing
            # would only raise ZeroDivisionError.
            continue
        miss *= 1 - based_set[(candidate, graph_num)] / neighbors
    key_set[candidate] = 1 - miss
# HighKey: at most M_2 words, ranked by (key, word) in descending order.
m_2 = min(M_2, len(words))
by_key = sorted(
    ((score, term) for term, score in key_set.items()),
    reverse=True,
)
HighKey = [term for _score, term in by_key[:m_2]]
# keygraph is the same dict object as foundations (an alias, not a copy), so
# the nodes and links added from here on are visible through both names.
keygraph = foundations
for word in HighKey:
    # High-key words that are not already nodes join with no links yet.
    keygraph.setdefault(word, [])

G = nx.Graph(keygraph)
# Re-split into connected components now that the high-key words are nodes.
# nx.connected_component_subgraphs() was removed in NetworkX 2.4; building the
# subgraphs from nx.connected_components() is the documented replacement.
graphs = [G.subgraph(nodes) for nodes in nx.connected_components(G)]
# new_links: for every high-key word and every component other than the one
# containing it, the (word, node) pair with the largest co-occurrence value.
new_links = []
for word in HighKey:
    # mark: index of the component that contains the word itself.
    mark = None
    for i, component in enumerate(graphs):
        if word in component.nodes():
            mark = i
    for i, component in enumerate(graphs):
        if i == mark:
            continue
        candidates = [
            (co_occurrence[(word, node)], (word, node))
            for node in component.nodes()
            if (word, node) in co_occurrence
        ]
        if not candidates:
            # The word never co-occurs with any node of this component, so
            # there is no link to add. Indexing an empty list here would
            # raise IndexError.
            continue
        new_links.append(max(candidates)[1])

# Record the new links in the adjacency mapping, in both directions.
for word1, word2 in new_links:
    keygraph[word1].append(word2)
    keygraph[word2].append(word1)
# c_set maps (high-key word, high-frequency word) -> co-occurrence value, for
# the pairs present in co_occurrence.
c_set = {}
for key_term in HighKey:
    for freq_term in HighFreq:
        pair = (key_term, freq_term)
        if pair in co_occurrence:
            c_set[pair] = co_occurrence[pair]

# tem_set maps word -> sum of the values of every c_set pair it takes part
# in, whether as first or as second element.
tem_set = {}
for (key_term, freq_term), column in c_set.items():
    tem_set[key_term] = tem_set.get(key_term, 0) + column
    tem_set[freq_term] = tem_set.get(freq_term, 0) + column

# keyword: at most m_2 words, ranked by (sum, word) in descending order.
by_column = sorted(
    ((total, term) for term, total in tem_set.items()),
    reverse=True,
)
keyword = [term for _total, term in by_column[:m_2]]
time3 = time.time()
# Print the header followed by one keyword per line.
print('Keywords:', *keyword, sep='\n')

# Report the two timed phases: time1 -> time2 and time2 -> time3.
knp_seconds = time2 - time1
keygraph_seconds = time3 - time2
print('KNP run time: %f s' % knp_seconds)
print('keygraph run time: %f s' % keygraph_seconds)
# Build the final graph and tag each edge with a weight: 1 for the links
# between high-frequency words, 0.1 for the links added for high-key words.
# A pair present in both lists ends up with 0.1 because it is assigned last.
# G[u][v] is the edge-attribute dict; the G.edge attribute used for this in
# NetworkX 1.x was removed in NetworkX 2.0.
G = nx.Graph(keygraph)
for word1, word2 in links:
    G[word1][word2]['weight'] = 1
for word1, word2 in new_links:
    G[word1][word2]['weight'] = 0.1

# Split the edges by weight so the two kinds can be drawn differently.
elarge = [(u, v) for (u, v, d) in G.edges(data=True) if d['weight'] > 0.5]
esmall = [(u, v) for (u, v, d) in G.edges(data=True) if d['weight'] <= 0.5]

pos = nx.spring_layout(G)
nx.draw_networkx_nodes(G, pos, node_size=700)
# Weight-1 edges use the default edge style; weight-0.1 edges are drawn
# dashed, blue and half-transparent.
nx.draw_networkx_edges(G, pos, edgelist=elarge, width=6)
nx.draw_networkx_edges(G, pos, edgelist=esmall, width=6, alpha=0.5,
                       edge_color='b', style='dashed')
nx.draw_networkx_labels(G, pos, font_size=20, font_family='sans-serif')
plt.axis('off')
plt.show()