-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathLDA_Inference.py
More file actions
executable file
·94 lines (85 loc) · 4.01 KB
/
LDA_Inference.py
File metadata and controls
executable file
·94 lines (85 loc) · 4.01 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
from gensim import corpora, models
import pickle, re
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.util import ngrams
import json
import pandas as pd
# Size of the n-grams fed to the LDA model (2 = bigrams).
NUM_GRAMS = 2

# Domain-specific stop words (HR-Tech conference tweets) filtered out in
# addition to NLTK's standard English stop list. Consumed via
# set(...).union(additional_stop_words) in text_cleanup(), so order is
# irrelevant. Fix: duplicate entries removed ('play' and 'happen' each
# appeared twice in the original list).
additional_stop_words = [
    'hrtechconf', 'peopleanalytics', 'hrtech', 'hr', 'hrconfes',
    'hrtechnology', 'voiceofhr', 'hrtechadvisor', 'gen', 'wait',
    'next', 'see', 'hcm', 'booth', 'tech', 'la', 'vega', 'last',
    'look', 'technology', 'work', 'announce', 'product', 'new',
    'team', 'use', 'happen', 'time', 'take', 'make', 'everyone',
    'anyone', 'week', 'day', 'year', 'let', 'go', 'come', 'word',
    'employee', 'get', 'people', 'today', 'session', 'need',
    'meet', 'help', 'talk', 'join', 'start', 'awesome', 'great',
    'achieve', 'job', 'tonight', 'everyday', 'room', 'ready',
    'one', 'company', 'say', 'well', 'data', 'share', 'love',
    'want', 'like', 'good', 'business', 'sure', 'miss', 'demo',
    'live', 'min', 'play', 'always', 'would', 'way', 'almost',
    'thank', 'still', 'many', 'much', 'info', 'wow', 'full',
    'org', 'create', 'leave', 'back', 'front', 'first', 'may',
    'tomorrow', 'yesterday', 'find', 'stay', 'add', 'conference',
    'top', 'stop', 'expo', 'hall', 'detail', 'row', 'award', 'hey',
    'continue', 'put', 'part', 'whole', 'some', 'any', 'everywhere',
    'convention', 'center', 'forget', 'congratulation', 'every',
    'agenda', 'gift', 'card', 'available', 'behind', 'meeting',
    'best', 'unlockpotentialpic', 'half', 'none',
    'human', 'resources', 'truly', 'win', 'possible', 'thanks',
    'know', 'check', 'visit', 'fun', 'give', 'think', 'forward',
    'twitter', 'com', 'pic', 'rt', 'via',
]
def read_data_from_pickle(infile):
    """Load and return the Python object stored in pickle file *infile*."""
    with open(infile, 'rb') as handle:
        data = pickle.load(handle)
    return data
def get_wordnet_pos(word):
    """
    Return the WordNet POS constant for *word*.

    Tags the single word with nltk.pos_tag, keeps the first letter of the
    Penn Treebank tag, and maps J/N/V/R to the matching WordNet constant,
    defaulting to NOUN for anything else.
    """
    pos_map = {"J": wordnet.ADJ,
               "N": wordnet.NOUN,
               "V": wordnet.VERB,
               "R": wordnet.ADV}
    first_letter = nltk.pos_tag([word])[0][1][0].upper()
    return pos_map.get(first_letter, wordnet.NOUN)
def text_cleanup(text):
    '''
    Normalize raw text into a tokenized list of cleaned words.

    Steps: lowercase, strip URLs and non-alphabetic characters, drop
    tokens shorter than 3 characters, lemmatize each token with its POS
    tag, then remove English and domain-specific stop words.
    '''
    # Lowercase, then replace URLs / non-letters with spaces and tokenize.
    tokens = re.sub(r'[^a-zA-Z]|(\w+:\/\/\S+)', ' ', str(text).lower()).split()
    # Keep only tokens of length 3 or more.
    tokens = [tok for tok in tokens if len(tok) > 2]
    # Lemmatize each token using its inferred part of speech.
    lemmer = WordNetLemmatizer()
    tokens = [lemmer.lemmatize(tok, get_wordnet_pos(tok)) for tok in tokens]
    # Drop standard English stop words plus the domain-specific list.
    blocked = set(stopwords.words('english')).union(additional_stop_words)
    return [tok for tok in tokens if tok not in blocked]
def word_grams(words, min=1, max=2):
    '''
    Return the list of space-joined n-grams of *words* for every n in
    range(min, max) (upper bound exclusive).
    '''
    # NOTE(review): the parameters shadow the builtins min/max; kept
    # unchanged for caller compatibility.
    return [' '.join(str(tok) for tok in gram)
            for n in range(min, max)
            for gram in ngrams(words, n)]
def inference(article):
    """
    Run LDA topic inference on a single *article*.

    Cleans and bigram-izes the text, converts it to a bag-of-words with
    the module-level dictionary, and queries the loaded LDA model.

    Returns a tuple (sorted_result, topic_names): the (topic_id, prob)
    pairs sorted by probability descending, and the top word of each
    topic in the model's original (unsorted) result order.
    """
    cleaned = text_cleanup(article)
    bow = tweets_dict.doc2bow(word_grams(cleaned, NUM_GRAMS, NUM_GRAMS + 1))
    topics = lda_model.get_document_topics(bow)
    # Top word per topic, taken before sorting (matches unsorted order).
    names = [lda_model.show_topic(topic_id, 1)[0][0] for topic_id, _ in topics]
    ranked = sorted(topics, key=lambda pair: pair[1], reverse=True)
    return ranked, names
# --- Module initialization (runs at import time; performs disk I/O) ---
# Load the pre-trained LDA model from disk.
lda_model = models.ldamodel.LdaModel.load('./models/tweets_lda.model')
# Load the pickled, already-cleaned tweet n-gram corpus.
cleaned_tweets_ngram = read_data_from_pickle('./models/cleaned_tweets_ngrams')
# Rebuild the token<->id dictionary from that corpus.
# NOTE(review): assumes this reconstruction matches the dictionary used at
# training time (same corpus, same filter settings) — confirm with the
# training script, otherwise doc2bow ids in inference() won't align.
tweets_dict = corpora.Dictionary(cleaned_tweets_ngram)
# Prune tokens appearing in fewer than 10 documents or in more than 50% of them.
tweets_dict.filter_extremes(no_below=10, no_above=0.5)