# text-analysis.py
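"""Term-frequency analysis of hotel reviews with a t-SNE export for visualization.

Reads reviews from ./input/caesars_palace_reviews.csv, tokenizes them with
nlpkit (keeping lemmatized nouns and adjectives), trains a term-frequency
model, projects the documents to two dimensions with t-SNE, and writes JSON
files for visualization under ./output/.
"""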
import numpy as np
import pandas as pd
import scipy.sparse

from nlpkit import tf_model
from nlpkit.model_loader import cached_or_train
from nlpkit.prepare import prepare_line
from nlpkit.tokenizer import (allow_noun_adj, doc_pos_tokens_with_texts,
                              lemmatizing, tokenize, tokens_to_texts)
from nlpkit.tsne import tsne_representation
from nlpkit.utils import create_folder_if_not_exists, do_or_load_pickle, dump_json


def basic_tokenize_doc(input_doc):
    """Tokenize a document, keep only nouns and adjectives, and lemmatize."""
    doc = tokenize(input_doc)
    doc = allow_noun_adj(doc)
    tokens_text = lemmatizing(tokens_to_texts(doc))
    doc = doc_pos_tokens_with_texts(doc, tokens_text)
    # Drop very short tokens, which carry little meaning for this analysis.
    doc = [token for token in doc if len(token.text) > 2]
    return doc


def prepare_and_tokenize(raw_document):
    """Clean a raw document with prepare_line, then tokenize it."""
    return basic_tokenize_doc(prepare_line(raw_document))
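
# Example (a sketch; the exact tokens depend on nlpkit's tagger and lemmatizer):
#   prepare_and_tokenize("The pools were beautiful!")
#   -> noun/adjective tokens whose lemmatized texts are e.g. ["pool", "beautiful"]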


def prepare_viz_data(tf_dump, metas, contents):
    """Dump per-word document counts, metadata, and contents as JSON files."""
    # Build a word -> [(doc_index, count), ...] mapping from the non-zero
    # entries of the document-term matrix.
    trained_array = tf_dump["transformations"].toarray()
    docs_where, words_where = np.where(trained_array > 0)
    vocab = tf_dump["features"]
    vocab_counts = {w: [] for w in vocab}
    for doc_where, word_where in zip(docs_where, words_where):
        word_name = vocab[word_where]
        count_where = trained_array[doc_where, word_where]
        vocab_counts[word_name].append((int(doc_where), float(count_where)))

    base_folder = "./output/"
    create_folder_if_not_exists(base_folder)
    dump_json(vocab_counts, base_folder + "vocab_stats.json")

    # Export the metadata frame as {column_name: [values...]}.
    metas_as_dict = metas.to_dict(orient='list')
    dump_json(metas_as_dict, base_folder + "metas.json")
    contents.to_json(base_folder + "contents.json", orient='values')
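
# vocab_stats.json maps each word to its per-document occurrences, e.g.
# {"pool": [[0, 2.0], [17, 1.0]], ...} (pairs of document index and count;
# the concrete numbers here are only illustrative).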


def review_sentiment(row):
    """Label a review as positive ('+') above 2 stars, negative ('-') otherwise."""
    return '+' if row['review_stars'] > 2 else '-'


def main():
    df = pd.read_csv("./input/caesars_palace_reviews.csv")
    # df = df[:1000]  # uncomment to experiment on a smaller sample
    reviews_text = df["text"]
    df['review_sentiment'] = df.apply(review_sentiment, axis=1)

    # Tokenization is expensive, so the result is cached as a pickle.
    tokenized_reviews = do_or_load_pickle("tokenized_reviews",
                                          lambda: [prepare_and_tokenize(raw_doc) for raw_doc in reviews_text],
                                          should_overwrite=True)
    trained_model = cached_or_train(tf_model.train_model,
                                    lambda: [tokens_to_texts(doc) for doc in tokenized_reviews],
                                    'tf', should_overwrite=True)

    # t-SNE needs a dense array; the trained model may store a sparse matrix.
    # (scipy.sparse.issparse replaces the deprecated scipy.sparse.csr.csr_matrix check.)
    trained_model_transformations = trained_model["transformations"]
    if scipy.sparse.issparse(trained_model_transformations):
        transforms = trained_model_transformations.toarray()
    else:
        transforms = trained_model_transformations

    perplexity = 35.
    metric = 'cosine'
    tsne_coords = do_or_load_pickle("tf_tsne",
                                    lambda: tsne_representation(transforms, metric=metric, perplexity=perplexity),
                                    should_overwrite=True)
    tsne_coords_x, tsne_coords_y = tsne_coords
    tsne_viz_data_values = [{"X": float(x), "Y": float(y)}
                            for x, y in zip(tsne_coords_x, tsne_coords_y)]
    tsne_viz_data = {"data": {"values": tsne_viz_data_values}}
    dump_json(tsne_viz_data, "./output/tf/tsne.json")

    prepare_viz_data(trained_model, df[["review_sentiment", "review_stars"]], reviews_text)


if __name__ == '__main__':
    main()
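
# Outputs written by a full run: ./output/vocab_stats.json, ./output/metas.json,
# ./output/contents.json, and ./output/tf/tsne.json, plus the intermediate
# pickles cached by do_or_load_pickle and cached_or_train.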