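"""project.py

Text-analysis project: downloads the Iliad and the Odyssey from Project
Gutenberg, computes word frequencies (minus stopwords), prints ASCII bar
charts of the top words, and uses OpenAI-based entity extraction (via the
local api module) to tally characters, places, and gods in each work.

Requires a local api.py providing STOPWORDS, extract_entities_with_openai,
and tally_entities (and, presumably, an OpenAI API key configured there).
"""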
import string
import urllib.request

from api import STOPWORDS, extract_entities_with_openai, tally_entities

# source texts on Project Gutenberg
ILIAD_URL = "https://www.gutenberg.org/cache/epub/6130/pg6130.txt"
ODYSSEY_URL = "https://www.gutenberg.org/cache/epub/1727/pg1727.txt"

# chunking settings for the OpenAI API calls (CHUNK_SIZE is in words)
CHUNK_SIZE = 1200
MAX_API_CHUNKS = 2


# download the texts from Project Gutenberg
def download_text(url):
    """Download a text file from the given URL and return it as a string."""
    try:
        with urllib.request.urlopen(url) as f:
            text = f.read().decode('utf-8')
        return text
    except Exception as e:
        print("An error occurred:", e)
        # return an empty string so callers can continue without crashing
        return ""


# strip the Project Gutenberg license header and footer
def strip_gutenberg(text):
    """Return only the body of a Gutenberg text, between the START and END markers."""
    start_token = "*** START OF"
    end_token = "*** END OF"
    s = text.find(start_token)
    e = text.find(end_token)
    if s != -1 and e != -1:
        # skip past the rest of the "*** START OF ..." marker line itself
        s = text.find("\n", s) + 1
        return text[s:e]
    return text


def clean_and_split(text):
    """Lowercase the text, strip punctuation, and split it into words."""
    # convert to lowercase
    text = text.lower()
    # replace basic punctuation with spaces
    for p in string.punctuation:
        text = text.replace(p, " ")
    # split into words
    words = text.split()
    return words


# filter the word list to remove stopwords
def remove_stopwords(words):
    new_words = []
    for w in words:
        if w not in STOPWORDS:
            new_words.append(w)
    return new_words


def count_words(words):
    """Return a dict mapping each word to the number of times it appears."""
    counts = {}
    for w in words:
        if w in counts:
            counts[w] = counts[w] + 1
        else:
            counts[w] = 1
    return counts


def top_words(counts, n=15):
    """Return the n most frequent (word, count) pairs, highest count first."""
    items = list(counts.items())
    items.sort(key=lambda x: x[1], reverse=True)
    return items[:n]


def ascii_bar_chart(pairs, title):
    """
    pairs: list of (word, count), sorted with the largest count first
    prints a simple bar chart scaled to the largest count
    """
    print("\n" + title)
    if not pairs:
        print("no data")
        return
    max_val = pairs[0][1]
    for word, count in pairs:
        # scale each bar to at most 30 characters
        bar_len = int(count / max_val * 30)
        bar = "#" * bar_len
        print(f"{word:15} | {bar}")


def chunks_from_words(words, chunk_size):
    """Yield successive chunks of up to chunk_size words from the word list."""
    for i in range(0, len(words), chunk_size):
        yield words[i:i + chunk_size]


def analyze_work(name, url):
    print(f"--- downloading {name} ---")
    text = download_text(url)
    text = strip_gutenberg(text)

    # 1. basic text analysis
    words = clean_and_split(text)
    filtered_words = remove_stopwords(words)
    counts = count_words(filtered_words)
    top15 = top_words(counts, 15)
    # visualization requirement: show a bar chart of top words
    ascii_bar_chart(top15, f"Top words in {name}")

    # 2. advanced: OpenAI entity extraction
    print(f"\nExtracting entities from {name} with OpenAI (up to {MAX_API_CHUNKS} chunks)...")
    all_entities = []
    original_words = text.split()
    chunk_number = 0
    for chunk_words in chunks_from_words(original_words, CHUNK_SIZE):
        if chunk_number >= MAX_API_CHUNKS:
            break
        passage = " ".join(chunk_words)
        try:
            data = extract_entities_with_openai(passage)
            all_entities.append(data)
        except Exception as e:
            print("OpenAI error:", e)
            break
        chunk_number = chunk_number + 1

    if all_entities:
        char_counts, place_counts, god_counts = tally_entities(all_entities)
    else:
        char_counts, place_counts, god_counts = {}, {}, {}

    print(f"\n(Partial) most mentioned characters in {name}:")
    for word, cnt in sorted(char_counts.items(), key=lambda x: x[1], reverse=True)[:10]:
        print(word, cnt)

    print(f"\n(Partial) most mentioned places in {name}:")
    for word, cnt in sorted(place_counts.items(), key=lambda x: x[1], reverse=True)[:10]:
        print(word, cnt)

    print(f"\n(Partial) most mentioned gods in {name}:")
    for word, cnt in sorted(god_counts.items(), key=lambda x: x[1], reverse=True)[:10]:
        print(word, cnt)

    return {
        "word_counts": counts,
        "entities": {
            "characters": char_counts,
            "places": place_counts,
            "gods": god_counts,
        },
    }


def main():
    iliad_result = analyze_work("Iliad", ILIAD_URL)
    odyssey_result = analyze_work("Odyssey", ODYSSEY_URL)

    # compare if we got entities for both works
    print("\n=== Comparison (partial) ===")
    if iliad_result["entities"]["characters"] and odyssey_result["entities"]["characters"]:
        print("\nCharacters mentioned more in Iliad than Odyssey:")
        for name, cnt in iliad_result["entities"]["characters"].items():
            other = odyssey_result["entities"]["characters"].get(name, 0)
            if cnt > other:
                print(f"{name}: Iliad {cnt}, Odyssey {other}")
    else:
        print("Skipped comparison because no entities were extracted.")


if __name__ == "__main__":
    main()