topic-model-explorer/topics.py at master · michaelweiss/topic-model-explorer · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
# -*- coding: utf-8 -*-

import streamlit as st

import gensim as gs

from gensim import models
from gensim.models.coherencemodel import CoherenceModel
# from gensim.models import ldamulticore
from gensim.corpora import Dictionary

import pandas as pd
import numpy as np
from scipy.optimize import linear_sum_assignment
import math

from io import StringIO
from re import sub

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import MWETokenizer

"""
Corpus of documents.
"""
class Corpus:
	def __init__(self, documents):
		# TODO: do I still need to check for that, or is unicode handled fine now?
		self.documents = self.to_ascii(documents)

	def to_ascii(self, documents):
		# replace non-ascii symbols left by text processing software
		documents['content'] = [sub(r'[^A-Za-z0-9,\.?!]+', ' ', document)
			for document in documents['content']]
		return documents

	def preprocess(self, user_defined_stopwords, multiwords):
		self.stopwords_en = self.read_stopwords("stopwords-en.txt")
		self.user_defined_stopwords = user_defined_stopwords.split('\n')
		self.user_defined_stopwords = [word.strip() for word in self.user_defined_stopwords]
		self.stopwords = self.stopwords_en + self.user_defined_stopwords
		self.tokenizer = self.create_tokenizer(multiwords)
		self.tokens = [[word for word in self.tokenizer.tokenize([self.lemmatize(word) for word in self.tokenize(document)])
				if word not in self.stopwords]
			for document in self.documents['content']]
		# self.tokens = [word for word in [[self.lemmatize(word) for word in self.tokenize(document)]
		# 	for document in self.documents['content']]]
		# self.tokens = [tokenizer.tokenize(word_list) for word_list in self.tokens]
		# self.tokens = [word for word in word_list if word not in self.stopwords]
		self.dictionary = Dictionary(self.tokens)

	def preprocess_document(self, document):
		return [word for word in self.tokenizer.tokenize([self.lemmatize(word) for word in self.tokenize(document)])
			if word not in self.stopwords]

	def read_stopwords(self, file):
		file = open(file, 'r')
		return file.read().split('\n')

	def create_tokenizer(self, multiwords):
		tokenizer = MWETokenizer()
		for mwe in tokenizer.tokenize(multiwords.split('\n')):
			tokenizer.add_mwe(mwe.split(' '))
		return tokenizer

	def tokenize(self, document):
		return sub(r'[^A-Za-z0-9]+', ' ', document).lower().split()

	def lemmatize(self, word):
		lemmatizer = WordNetLemmatizer()
		return lemmatizer.lemmatize(word)

	def bow(self):
		return [self.dictionary.doc2bow(doc) for doc in self.tokens]

	def get_document_bow(self, document):
		document_tokens = self.preprocess_document(document)
		return self.dictionary.doc2bow(document_tokens)

	def average_document_length(self):
		return np.mean(map(len, self.tokens))

"""
Topic model of a corpus.
"""
class TopicModel:
	def gensim_version(self):
		return gs.__version__

	def load_corpus(self, url, stopwords, multiwords):
		if url is not None:
			url.seek(0)	 # move read head back to the start (StringIO behaves like a file)
			documents = pd.read_csv(url)
			if ('name' not in documents or 'content' not in documents):
				return None
			corpus = Corpus(documents)
			corpus.preprocess(stopwords, multiwords)
			return corpus
		else:
			return None

	def fit(self, corpus, number_of_topics, number_of_iterations=50, number_of_passes=1,
			number_of_chunks=1, random_seed=None, alpha="symmetric"):
		if alpha == "talley":
			alpha = np.array([self.alpha(corpus, number_of_topics)] * number_of_topics)
		# Added random_state for reproducibility (the default is to choose a random seed)
		return LDA(models.LdaModel(corpus.bow(), number_of_topics, corpus.dictionary,
			iterations=number_of_iterations, passes=number_of_passes, random_state=random_seed,
			chunksize=self.chunksize(corpus, number_of_chunks), alpha=alpha))

	def alpha(self, corpus, number_of_topics):
		return 0.05 * corpus.average_document_length() / number_of_topics

	def chunksize(self, corpus, number_of_chunks):
		return math.ceil(len(corpus.documents) / number_of_chunks)

class LDA:
	def __init__(self, lda):
		self.lda = lda

	def number_of_topics(self):
		return self.lda.num_topics

	def chunksize(self):
		return self.lda.chunksize

	def show_topics(self, number_of_topics, number_of_words):
		return self.lda.show_topics(num_topics=number_of_topics,
			num_words=number_of_words, formatted=False)

	def get_document_topics(self, document_bow):
		return self.lda.get_document_topics(document_bow)

	def coherence(self, corpus):
		coherence_model = CoherenceModel(model=self.lda, texts=corpus.tokens,
			dictionary=corpus.dictionary, coherence='c_uci')
		return coherence_model.get_coherence()

	# return a difference matrix between two topic models
	# computes the average jaccard distance as defined by Greene (2014)
	def difference(self, other, n=10):
		return sum([self.jaccard(other, k) for k in range(n)]) / n

	def jaccard(self, other, k):
		diff, _ = self.lda.diff(other.lda, distance='jaccard', num_words=k)
		return diff

	def document_topic_matrix(self, corpus):
		dtm = []
		for document_bow in corpus.bow():
			dtm.append(self.topics_sparse_to_full(self.get_document_topics(document_bow)))
			tcid = corpus.dictionary.id2token
		return pd.DataFrame(dtm)

	def topics_sparse_to_full(self, topics):
		topics_full = [0] * self.number_of_topics()  # pythonic way of creating a list of zeros
		for topic, score in topics:
			topics_full[topic] = score
		return topics_full

	def topic_co_occurrence_matrix(self, dtm, min_weight=0.1):
		return [[t for t, w in topics if w >= min_weight] for topics in dtm]

	def tcom_to_sentences(self, tcom):
		for tco in tcom:
			tco = ["T{}".format(t) for t in tco]
			tco.append('.')
			return " ".join(tco)

	def topic_sentences(self, corpus, min_weight=0.1):
		dtm = [self.get_document_topics(bow) for bow in corpus.bow()]
		tcom = self.topic_co_occurrence_matrix(dtm, min_weight)
		return self.tcom_to_sentences(tcom)

class TopicAlignment:
	def __init__(self, topic_model, corpus, number_of_topics, number_of_chunks, number_of_runs, random_seed=None):
		self.topic_model = topic_model
		self.corpus = corpus
		self.number_of_topics = number_of_topics
		self.number_of_chunks = number_of_chunks
		self.number_of_runs = number_of_runs
		self.random_seed = random_seed

	def fit(self, progress_update):
		# experimental: remember the computed LDA models
		# don't store the computed LDA models; in this way, they don't get
		# included in the hash streamlit uses to cache results
		self.lda_models = self.lda_model_runs(progress_update)
		# determine the matching topics across the different runs
		self.matches = self.matches(self.lda_models)
		# find the top topic keywords for each topic and each run
		self.topics = self.topics(self.lda_models)
		# collect the keywords and associated weights for each topic across all topic models
		self.keywords, self.weights = self.keywords_with_weights(self.lda_models)
		# find the topics for each document
		self.dtm, self.documents = self.documents(self.lda_models)

	# create a group of topic models with the same number of topics
	def lda_model_runs(self, progress_update):
		lda_models = []
		for run in range(self.number_of_runs):
			lda_models.append(self.topic_model.fit(self.corpus, self.number_of_topics,
				number_of_chunks=self.number_of_chunks, random_seed=self.random_seed))
			progress_update(run)
		return lda_models

	# extract the topic words for each topic in all topic models
	def topics(self, lda_models):
		return pd.DataFrame([[" ".join([tw[0] for tw in lda_model.lda.show_topic(t, 10)])
			for lda_model in lda_models] for t in range(self.number_of_topics)])

	# compute the average Jaccard distance between the topic models
	def differences(self, lda_models):
		return [lda_models[0].difference(lda_models[i])
			for i in range(1, len(lda_models))]

	# fit topics between the first and each of the remaining topic models using
	# the Hungarian linear assignment method
	def matches(self, lda_models):
		diffs = self.differences(lda_models)
		matches = pd.DataFrame()
		# first column are the topics of the first topic model
		matches[0] = range(self.number_of_topics)
		# minimize the total misalignment between topics
		for i in range(1, self.number_of_runs):
			_, cols = linear_sum_assignment(diffs[i-1])
			# each column contains the topics that align with the topics of the first topic
			matches[i] = cols
		return matches

	def keywords_with_weights(self, lda_models):
		keywords, weights = [], []
		for topic in range(self.number_of_topics):
			keywords_for_topic = pd.DataFrame()
			weights_for_topic = pd.DataFrame()
			for i in range(self.number_of_runs):
				keywords_for_topic[i] = [tw[0] for tw
					in lda_models[i].lda.show_topic(self.matches[i][topic], 10)]
				weights_for_topic[i] = [tw[1] for tw
					in lda_models[i].lda.show_topic(self.matches[i][topic], 10)]
			keywords.append(keywords_for_topic)
			weights.append(weights_for_topic)
		return keywords, weights

	def documents(self, lda_models):
		dtm = [lda_models[i].document_topic_matrix(self.corpus)
			for i in range(self.number_of_runs)]
		documents = []
		for topic in range(self.number_of_topics):
			documents_for_topic = pd.DataFrame()
			for i in range(self.number_of_runs):
				documents_for_topic[i] = dtm[i][self.matches[i][topic]]
			documents.append(documents_for_topic)
		return dtm, documents

# initialize

nltk.download('wordnet')