forked from OIM3640/Text-Analysis-Project
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathword_frequency.py
More file actions
58 lines (46 loc) · 2.54 KB
/
word_frequency.py
File metadata and controls
58 lines (46 loc) · 2.54 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import re
from collections import Counter # count how # times each words appear.
import pickle
# The practice goal is to get more familiar with frequency analysis. The aim is to analyze the frequency of words in the book A Christmas Carol by Charles Dickens, stored in the 'book_text.txt' file, while excluding the words listed in 'stop_words.txt'. I first tried to use 'nltk.download('stopwords')' (guided by ChatGPT),
# but I found that there were certain words I wanted to exclude, such as 'said', 'one', and 'upon', so I made a txt file containing all the stop words sourced from "sebleier/NLTK's list of english stopwords", and then added the specific stop words from the original frequency list that I wanted to exclude. The new frequency list is in the new file named refined_word_frequencies.txt. This way, the words in the list are more meaningful.
def load_book_text(pickle_file_path):
    """Load the book text from a pickle file and return it lower-cased.

    Args:
        pickle_file_path: Path to a pickle file whose payload is a single
            ``str`` holding the book text.

    Returns:
        The unpickled text converted to lower case.

    Raises:
        TypeError: If the unpickled object is not a ``str``.
    """
    # NOTE: pickle can execute arbitrary code while loading; only open pickle
    # files that were produced by this project or another trusted source.
    with open(pickle_file_path, "rb") as file:
        book_text = pickle.load(file)

    # Fail early with a clear message instead of an AttributeError (or a
    # silently returned bytes object) when the pickle does not hold text.
    if not isinstance(book_text, str):
        raise TypeError(
            f"expected pickled str in {pickle_file_path!r}, "
            f"got {type(book_text).__name__}"
        )
    return book_text.lower()
def clean_count_words(book_text, stop_words_list):
    """Count how often each word occurs in the text, skipping stop words.

    Words are the maximal runs of ``\\w`` characters in ``book_text``. The
    comparison against the stop words is exact (case-sensitive), so both
    arguments should already use the same letter case.

    Args:
        book_text: The text to analyse.
        stop_words_list: Any iterable of words to leave out of the count.

    Returns:
        A ``collections.Counter`` mapping each remaining word to its count.
    """
    # Build the set once: membership tests become O(1) instead of a list scan
    # per word, and one-shot iterables (e.g. generators) are consumed safely.
    stop_words = set(stop_words_list)
    return Counter(
        word for word in re.findall(r"\w+", book_text) if word not in stop_words
    )
def perform_word_frequency_analysis(
    book_pickle_path, frequency_output_path, stop_words_file_path
):
    """Count the non-stop-word frequencies of a pickled book and save them.

    Args:
        book_pickle_path: Path to the pickle file, passed to
            ``load_book_text``.
        frequency_output_path: Path of the UTF-8 text file to (over)write
            with one ``word: count`` line per word, most frequent first.
        stop_words_file_path: Path to a UTF-8 text file holding the
            whitespace-separated stop words to exclude.
    """
    with open(stop_words_file_path, "r", encoding="utf-8") as stop_words_file:
        # load_book_text returns lower-cased text, so the stop words are
        # lower-cased too; otherwise a capitalised entry would never match.
        # split() with no argument already ignores leading/trailing whitespace.
        stop_words = set(stop_words_file.read().lower().split())

    book_text = load_book_text(book_pickle_path)
    word_frequencies = clean_count_words(book_text, stop_words)

    # most_common() sorts by descending count; words with equal counts stay in
    # the order they were first encountered in the text.
    output_lines = [
        f"{word}: {frequency}\n"
        for word, frequency in word_frequencies.most_common()
    ]
    with open(frequency_output_path, "w", encoding="utf-8") as output_file:
        output_file.writelines(output_lines)
if __name__ == "__main__":
    # Script entry point: analyse the pickled book with the project's
    # stop-word list, write the frequency table, then report completion.
    perform_word_frequency_analysis(
        book_pickle_path="book_text.pkl",
        frequency_output_path="refined_word_frequencies.txt",
        stop_words_file_path="stop_words.txt",
    )
    print(
        "Word frequency analysis completed and results are saved to refined_word_frequencies.txt."
    )