-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathtopics.py
More file actions
260 lines (214 loc) · 9.23 KB
/
topics.py
File metadata and controls
260 lines (214 loc) · 9.23 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
# -*- coding: utf-8 -*-
import streamlit as st
import gensim as gs
from gensim import models
from gensim.models.coherencemodel import CoherenceModel
# from gensim.models import ldamulticore
from gensim.corpora import Dictionary
import pandas as pd
import numpy as np
from scipy.optimize import linear_sum_assignment
import math
from io import StringIO
from re import sub
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import MWETokenizer
"""
Corpus of documents.
"""
class Corpus:
def __init__(self, documents):
# TODO: do I still need to check for that, or is unicode handled fine now?
self.documents = self.to_ascii(documents)
def to_ascii(self, documents):
# replace non-ascii symbols left by text processing software
documents['content'] = [sub(r'[^A-Za-z0-9,\.?!]+', ' ', document)
for document in documents['content']]
return documents
def preprocess(self, user_defined_stopwords, multiwords):
self.stopwords_en = self.read_stopwords("stopwords-en.txt")
self.user_defined_stopwords = user_defined_stopwords.split('\n')
self.user_defined_stopwords = [word.strip() for word in self.user_defined_stopwords]
self.stopwords = self.stopwords_en + self.user_defined_stopwords
self.tokenizer = self.create_tokenizer(multiwords)
self.tokens = [[word for word in self.tokenizer.tokenize([self.lemmatize(word) for word in self.tokenize(document)])
if word not in self.stopwords]
for document in self.documents['content']]
# self.tokens = [word for word in [[self.lemmatize(word) for word in self.tokenize(document)]
# for document in self.documents['content']]]
# self.tokens = [tokenizer.tokenize(word_list) for word_list in self.tokens]
# self.tokens = [word for word in word_list if word not in self.stopwords]
self.dictionary = Dictionary(self.tokens)
def preprocess_document(self, document):
return [word for word in self.tokenizer.tokenize([self.lemmatize(word) for word in self.tokenize(document)])
if word not in self.stopwords]
def read_stopwords(self, file):
file = open(file, 'r')
return file.read().split('\n')
def create_tokenizer(self, multiwords):
tokenizer = MWETokenizer()
for mwe in tokenizer.tokenize(multiwords.split('\n')):
tokenizer.add_mwe(mwe.split(' '))
return tokenizer
def tokenize(self, document):
return sub(r'[^A-Za-z0-9]+', ' ', document).lower().split()
def lemmatize(self, word):
lemmatizer = WordNetLemmatizer()
return lemmatizer.lemmatize(word)
def bow(self):
return [self.dictionary.doc2bow(doc) for doc in self.tokens]
def get_document_bow(self, document):
document_tokens = self.preprocess_document(document)
return self.dictionary.doc2bow(document_tokens)
def average_document_length(self):
return np.mean(map(len, self.tokens))
"""
Topic model of a corpus.
"""
class TopicModel:
def gensim_version(self):
return gs.__version__
def load_corpus(self, url, stopwords, multiwords):
if url is not None:
url.seek(0) # move read head back to the start (StringIO behaves like a file)
documents = pd.read_csv(url)
if ('name' not in documents or 'content' not in documents):
return None
corpus = Corpus(documents)
corpus.preprocess(stopwords, multiwords)
return corpus
else:
return None
def fit(self, corpus, number_of_topics, number_of_iterations=50, number_of_passes=1,
number_of_chunks=1, random_seed=None, alpha="symmetric"):
if alpha == "talley":
alpha = np.array([self.alpha(corpus, number_of_topics)] * number_of_topics)
# Added random_state for reproducibility (the default is to choose a random seed)
return LDA(models.LdaModel(corpus.bow(), number_of_topics, corpus.dictionary,
iterations=number_of_iterations, passes=number_of_passes, random_state=random_seed,
chunksize=self.chunksize(corpus, number_of_chunks), alpha=alpha))
def alpha(self, corpus, number_of_topics):
return 0.05 * corpus.average_document_length() / number_of_topics
def chunksize(self, corpus, number_of_chunks):
return math.ceil(len(corpus.documents) / number_of_chunks)
class LDA:
def __init__(self, lda):
self.lda = lda
def number_of_topics(self):
return self.lda.num_topics
def chunksize(self):
return self.lda.chunksize
def show_topics(self, number_of_topics, number_of_words):
return self.lda.show_topics(num_topics=number_of_topics,
num_words=number_of_words, formatted=False)
def get_document_topics(self, document_bow):
return self.lda.get_document_topics(document_bow)
def coherence(self, corpus):
coherence_model = CoherenceModel(model=self.lda, texts=corpus.tokens,
dictionary=corpus.dictionary, coherence='c_uci')
return coherence_model.get_coherence()
# return a difference matrix between two topic models
# computes the average jaccard distance as defined by Greene (2014)
def difference(self, other, n=10):
return sum([self.jaccard(other, k) for k in range(n)]) / n
def jaccard(self, other, k):
diff, _ = self.lda.diff(other.lda, distance='jaccard', num_words=k)
return diff
def document_topic_matrix(self, corpus):
dtm = []
for document_bow in corpus.bow():
dtm.append(self.topics_sparse_to_full(self.get_document_topics(document_bow)))
tcid = corpus.dictionary.id2token
return pd.DataFrame(dtm)
def topics_sparse_to_full(self, topics):
topics_full = [0] * self.number_of_topics() # pythonic way of creating a list of zeros
for topic, score in topics:
topics_full[topic] = score
return topics_full
def topic_co_occurrence_matrix(self, dtm, min_weight=0.1):
return [[t for t, w in topics if w >= min_weight] for topics in dtm]
def tcom_to_sentences(self, tcom):
for tco in tcom:
tco = ["T{}".format(t) for t in tco]
tco.append('.')
return " ".join(tco)
def topic_sentences(self, corpus, min_weight=0.1):
dtm = [self.get_document_topics(bow) for bow in corpus.bow()]
tcom = self.topic_co_occurrence_matrix(dtm, min_weight)
return self.tcom_to_sentences(tcom)
class TopicAlignment:
def __init__(self, topic_model, corpus, number_of_topics, number_of_chunks, number_of_runs, random_seed=None):
self.topic_model = topic_model
self.corpus = corpus
self.number_of_topics = number_of_topics
self.number_of_chunks = number_of_chunks
self.number_of_runs = number_of_runs
self.random_seed = random_seed
def fit(self, progress_update):
# experimental: remember the computed LDA models
# don't store the computed LDA models; in this way, they don't get
# included in the hash streamlit uses to cache results
self.lda_models = self.lda_model_runs(progress_update)
# determine the matching topics across the different runs
self.matches = self.matches(self.lda_models)
# find the top topic keywords for each topic and each run
self.topics = self.topics(self.lda_models)
# collect the keywords and associated weights for each topic across all topic models
self.keywords, self.weights = self.keywords_with_weights(self.lda_models)
# find the topics for each document
self.dtm, self.documents = self.documents(self.lda_models)
# create a group of topic models with the same number of topics
def lda_model_runs(self, progress_update):
lda_models = []
for run in range(self.number_of_runs):
lda_models.append(self.topic_model.fit(self.corpus, self.number_of_topics,
number_of_chunks=self.number_of_chunks, random_seed=self.random_seed))
progress_update(run)
return lda_models
# extract the topic words for each topic in all topic models
def topics(self, lda_models):
return pd.DataFrame([[" ".join([tw[0] for tw in lda_model.lda.show_topic(t, 10)])
for lda_model in lda_models] for t in range(self.number_of_topics)])
# compute the average Jaccard distance between the topic models
def differences(self, lda_models):
return [lda_models[0].difference(lda_models[i])
for i in range(1, len(lda_models))]
# fit topics between the first and each of the remaining topic models using
# the Hungarian linear assignment method
def matches(self, lda_models):
diffs = self.differences(lda_models)
matches = pd.DataFrame()
# first column are the topics of the first topic model
matches[0] = range(self.number_of_topics)
# minimize the total misalignment between topics
for i in range(1, self.number_of_runs):
_, cols = linear_sum_assignment(diffs[i-1])
# each column contains the topics that align with the topics of the first topic
matches[i] = cols
return matches
def keywords_with_weights(self, lda_models):
keywords, weights = [], []
for topic in range(self.number_of_topics):
keywords_for_topic = pd.DataFrame()
weights_for_topic = pd.DataFrame()
for i in range(self.number_of_runs):
keywords_for_topic[i] = [tw[0] for tw
in lda_models[i].lda.show_topic(self.matches[i][topic], 10)]
weights_for_topic[i] = [tw[1] for tw
in lda_models[i].lda.show_topic(self.matches[i][topic], 10)]
keywords.append(keywords_for_topic)
weights.append(weights_for_topic)
return keywords, weights
def documents(self, lda_models):
dtm = [lda_models[i].document_topic_matrix(self.corpus)
for i in range(self.number_of_runs)]
documents = []
for topic in range(self.number_of_topics):
documents_for_topic = pd.DataFrame()
for i in range(self.number_of_runs):
documents_for_topic[i] = dtm[i][self.matches[i][topic]]
documents.append(documents_for_topic)
return dtm, documents
# initialize
nltk.download('wordnet')