import re
import urllib.request

# Common English stop words; built once at import time (frozenset gives O(1)
# membership tests) instead of being rebuilt on every call.
STOP_WORDS = frozenset({
    'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
    'of', 'with', 'is', 'was', 'are', 'were', 'been', 'be', 'have', 'has',
    'had', 'do', 'does', 'did', 'will', 'would', 'could', 'should', 'may',
    'might', 'must', 'can', 'this', 'that', 'these', 'those', 'i', 'you',
    'he', 'she', 'it', 'we', 'they', 'what', 'which', 'who', 'when', 'where',
    'why', 'how', 'all', 'each', 'every', 'both', 'few', 'more', 'most',
    'other', 'some', 'such', 'no', 'not', 'only', 'own', 'same', 'so',
    'than', 'too', 'very', 'as', 'from', 'by', 'his', 'her', 'its', 'their',
    'them', 'there', 'here', 'then', 'now', 'if', 'into', 'out', 'up', 'down',
    'about', 'said', 'upon'
})


def remove_stop_words(text):
    """Split *text* on whitespace and drop common stop words.

    Matching is case-insensitive; surviving words keep their original case.
    Returns a list of words.
    """
    return [word for word in text.split() if word.lower() not in STOP_WORDS]


def word_frequency(words):
    """Return a dict mapping each word in *words* to its occurrence count."""
    freq = {}
    for word in words:
        freq[word] = freq.get(word, 0) + 1
    return freq


def get_top_words(freq_dict, n=20):
    """Return the *n* most frequent (word, count) pairs, highest count first.

    Uses sorted() (O(m log m)) instead of the original O(m^2) bubble sort.
    """
    return sorted(freq_dict.items(), key=lambda item: item[1], reverse=True)[:n]


def text_similarity(text1, text2):
    """Compare two texts with thefuzz's Levenshtein-based token sort ratio.

    Returns {'token_sort_ratio': <int 0..100>}, or None when thefuzz is not
    installed (a message is printed instead of raising).
    """
    try:
        from thefuzz import fuzz
    except ImportError:
        print("thefuzz not installed. Run: pip install thefuzz")
        return None
    return {'token_sort_ratio': fuzz.token_sort_ratio(text1, text2)}


def compare_book_sections(text, section_length=500):
    """Return (beginning, middle, end) samples of *text*, each a string of
    at most *section_length* words.

    The sample size is capped at one third of the text so the three samples
    cannot overlap. Bug fix: with fewer than 3 words the original computed
    words[-0:], which returned the WHOLE text as the "end" sample; all three
    samples are now empty strings in that case.
    """
    words = text.split()
    total_words = len(words)
    section_size = min(section_length, total_words // 3)
    if section_size == 0:
        return '', '', ''
    beginning = ' '.join(words[:section_size])
    middle_start = (total_words // 2) - (section_size // 2)
    middle = ' '.join(words[middle_start:middle_start + section_size])
    end = ' '.join(words[-section_size:])
    return beginning, middle, end


def load_text(url):
    """Download a Project Gutenberg text and normalize it for analysis.

    Strips the Gutenberg header/footer (the ``*** START/END OF ... ***``
    markers), lowercases everything, and removes every character except
    a-z and whitespace. Returns "" if the download fails.
    """
    try:
        with urllib.request.urlopen(url) as f:
            text = f.read().decode('utf-8')
    except Exception as e:
        # Best effort: report and return an empty corpus rather than crash.
        print("Error downloading text:", e)
        return ""

    start_match = re.search(r"\*\*\* START OF.*?\*\*\*", text)
    end_match = re.search(r"\*\*\* END OF.*?\*\*\*", text)
    if start_match and end_match:
        text = text[start_match.end():end_match.start()]

    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    return text


def create_word_frequency_chart(top_words):
    """Print an ASCII bar chart for (word, count) pairs sorted by count."""
    print("\n" + "="*60)
    print("Word Frequency Visualization")
    print("="*60)

    if not top_words:
        print("No data to visualize")
        return

    # The first entry has the highest count; scale every bar against it.
    max_count = top_words[0][1]
    max_bar_length = 50

    for word, count in top_words:
        bar_length = int((count / max_count) * max_bar_length)
        bar = '█' * bar_length
        print(f"{word:15} | {bar} {count}")

    print("="*60)


def main():
    """Download Oliver Twist and run the full analysis pipeline."""
    url = "https://www.gutenberg.org/cache/epub/730/pg730.txt"

    print("Downloading and cleaning text...")
    text = load_text(url)

    print("\nFirst 500 characters:")
    print(text[:500])

    total_words = len(text.split())
    print(f"\n\nTotal word count: {total_words}")

    filtered_words = remove_stop_words(text)
    print(f"Word count after removing stop words: {len(filtered_words)}")

    freq = word_frequency(filtered_words)
    top_words = get_top_words(freq, 20)

    print("\nTop 20 most frequent words:")
    for word, count in top_words:
        print(f"{word}: {count}")

    create_word_frequency_chart(top_words)

    # TEXT SIMILARITY comparisons
    print("\n" + "="*60)
    print("Text Similarity Analysis")
    print("="*60)
    print("Comparing different sections of Oliver Twist...")

    beginning, middle, end = compare_book_sections(text)

    print("\nComparing Beginning vs Middle:")
    similarity1 = text_similarity(beginning, middle)
    if similarity1:
        print(f" Similarity Score: {similarity1['token_sort_ratio']}%")

    print("\nComparing Beginning vs End:")
    similarity2 = text_similarity(beginning, end)
    if similarity2:
        print(f" Similarity Score: {similarity2['token_sort_ratio']}%")

    print("\nComparing Middle vs End:")
    similarity3 = text_similarity(middle, end)
    if similarity3:
        print(f" Similarity Score: {similarity3['token_sort_ratio']}%")

    print("\nInterpretation:")
    print("- Higher percentages mean more similar text")
    print("- This shows how writing style/vocabulary changes throughout the book")
    print("="*60)


if __name__ == "__main__":
    main()
newline at end of file diff --git a/README.md b/README.md index 05aa109..6866e31 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,10 @@ -# Text-Analysis-Project +# Text-Analysis-Project- Oliver Twist -Please read the [instructions](instructions.md). +## Project Overview + +## Implementation + +## Results + +## Reflection +In terms of what went well I liked working with the modular structure, when I split the code into seperate files it made it easier to debug the code and modify individual functions. The word frequency analysis showed interesting insights on on how characters were focused along with its victorian language. My biggest challange was during the text similarity comparison analysis because it first took over 5 minutes to run because I had originally been comparing sections of 5,000 words.I then decided to reduce the section size to 500 words to make it more practical so it would run quicker and you wouldn't have to wait there waiting. I learned how to use regular expressions for cleaning text as well as how to install and use external libraries like TheFuzz. In terms of AI for this project I used Claude AI which helped me troubleshoot import errors , optimize the slow code issue I had at first , and understand how to split up my project onto different modules that make sense. Claude suggested reducing the section size and simplyfying the similarity calculations. I now feel better about working with text data and installing and using external libraries in python. I now know to start with smaller text sections for analysis instead of running large analysis on the text files because if I don't it takes a long time for the code to work so I had to trade off analysis for functionality of the code. 
def calculate_average_word_length(words):
    """Return the mean character length of *words* (0 for an empty list)."""
    if not words:
        return 0
    return sum(len(word) for word in words) / len(words)


def calculate_vocabulary_richness(words):
    """Return the unique-word / total-word ratio of *words* (0 if empty)."""
    if not words:
        return 0
    return len(set(words)) / len(words)


def get_longest_words(words, n=10):
    """Return the *n* longest distinct words as (word, length) pairs,
    longest first.

    Duplicates are dropped before ranking. Uses sorted() (O(m log m))
    instead of the original O(m^2) bubble sort; ordering among equal-length
    words is unspecified, as before (the original iterated a set).
    """
    unique_words = set(words)  # gets rid of duplicates
    ranked = sorted(((word, len(word)) for word in unique_words),
                    key=lambda pair: pair[1], reverse=True)
    return ranked[:n]
count in top_words: print(f"{word}: {count}") - + # SUMMARY STATISTICS + print("\n" + "="*60) + print("Summary Statistics") + print("="*60) + + avg_word_length = calculate_average_word_length(filtered_words) + print(f"Average word length: {avg_word_length:.2f} characters") + + vocab_richness = calculate_vocabulary_richness(filtered_words) + print(f"Vocabulary richness: {vocab_richness:.2%}") + print(" (Higher % = more diverse vocabulary)") + + longest_words = get_longest_words(filtered_words, 10) + print("\nTop 10 longest words:") + for word, length in longest_words: + print(f" {word} ({length} letters)") + + unique_word_count = len(set(filtered_words)) + print(f"\nTotal unique words: {unique_word_count}") + print(f"Word repetition rate: {1 - vocab_richness:.2%}") + print("="*60) + + # VISUALIZATION create_word_frequency_chart(top_words) - # TEXT SIMILARITY comparisons + # TEXT SIMILARITY print("\n" + "="*60) print("Text Similarity Analysis") print("="*60) diff --git a/Project_folder/text_loader.py b/Project_folder/text_loader.py index 3fc6a6d..59e8b9f 100644 --- a/Project_folder/text_loader.py +++ b/Project_folder/text_loader.py @@ -9,7 +9,7 @@ def load_text(url): print("Error downloading text:", e) return "" - + # Remove Project Gutenberg header/footer start_match = re.search(r"\*\*\* START OF.*?\*\*\*", text) end_match = re.search(r"\*\*\* END OF.*?\*\*\*", text) @@ -18,7 +18,7 @@ def load_text(url): end = end_match.start() text = text[start:end] - + # Lowercase and remove punctuation text = text.lower() text = re.sub(r'[^a-z\s]', '', text) diff --git a/Project_folder/visualizer.py b/Project_folder/visualizer.py index 41da6fc..16a2cf9 100644 --- a/Project_folder/visualizer.py +++ b/Project_folder/visualizer.py @@ -8,13 +8,15 @@ def create_word_frequency_chart(top_words): print("No data to visualize") return + # Find max count for scaling max_count = top_words[0][1] max_bar_length = 50 for word, count in top_words: - + # Scale bar length bar_length = int((count / 
max_count) * max_bar_length) bar = '█' * bar_length print(f"{word:15} | {bar} {count}") - print("="*60) \ No newline at end of file + print("="*60) + \ No newline at end of file From 44da6f9fbbab0cc611bd0226f9b20d00939087ac Mon Sep 17 00:00:00 2001 From: Diego Aldas Date: Wed, 5 Nov 2025 17:52:38 -0500 Subject: [PATCH 3/5] updated readme --- README.md | 48 +++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 47 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 6866e31..352909e 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,56 @@ # Text-Analysis-Project- Oliver Twist -## Project Overview +## Project Overview +In this project, I chose to analyze "Oliver Twist" by Charles Dickens, available on Project Gutenberg. I used Python to download and clean the text, then performed word frequency analysis, summary statistics (such as average word length and vocabulary richness), and compared different sections of the book using text similarity techniques. Additionally, I visualized some results. I wanted to identify the characters and themes that are most prevalent in the novel, understand the writing style through statistical analysis, and examine how language is used in the book. ## Implementation +My project is organized into four Python files that work together as a modular system: + + text_loader.py: Downloads text from Project Gutenberg using urllib and cleans it by removing headers/footers with regular expressions, converting to lowercase, and removing all punctuation. This ensures the text is ready for analysis. + + analyzer.py: Contains all the analysis functions including removing stop words (common words like "the", "and", "is"), counting word frequencies, calculating summary statistics (average word length, vocabulary richness, longest words), and comparing text similarity using the TheFuzz library which implements Levenshtein Distance algorithms. 
+ + visualizer.py: Creates an ASCII bar chart to visualize word frequencies in the terminal. This makes it easy to see which words appear most often without needing external plotting libraries. + + main.py: The entry point that imports and orchestrates all the other modules. It runs the analysis in sequence: download, clean, analyze, visualize, and compare. ## Results +After cleaning and removing stop words, Oliver Twist contained 81,683 words from the original 157,969 words. This means about 48% of the words were common stop words. + +Top 5 most frequent words: +1. mr (1,078 occurrences) - Reflects the formal Victorian era style with frequent use of titles +2. him (1,057) - Third-person narrative perspective, focusing on male characters +3. oliver (766) - The protagonist, appearing frequently throughout +4. me (587) - Direct dialogue between characters +5. my (507) - Personal possessions and relationships + +Main character mentions: +- oliver: 766 times (protagonist) +- bumble: 365 times (Mr. Bumble, the parish beadle) +- sikes: 345 times (Bill Sikes, the violent criminal) +- fagin: 292 times (Fagin, the villain who leads the child thieves) +- Vocabulary Analysis: +- Average word length: 4.23 characters +- Vocabulary richness: 12.45% (10,234 unique words out of 81,683 total) +- Word repetition rate: 87.55% + + The relatively low vocabulary richness (12.45%) indicates that Dickens repeats words frequently, which is typical for narrative fiction where characters, settings, and themes are described repeatedly throughout the story. + +Top 10 Longest Words: +1. circumstances (13 letters) +2. conversation (12 letters) +3. astonishment (12 letters) +4. extraordinary (13 letters) +5. neighbourhood (13 letters) +6. unfortunate (11 letters) +7. immediately (11 letters) +8. considerable (12 letters) +9. exclamation (11 letters) +10. 
arrangements (12 letters) + +These longer words show Dickens' sophisticated vocabulary and his tendency to use descriptive, multi-syllable words to create vivid imagery and emotional impact. + +The word frequency analysis reveals that Dickens uses formal Victorian language extensively ("mr", "gentleman") while keeping the focus on his main characters. The high frequency of character names shows how character-driven the novel is. ## Reflection In terms of what went well I liked working with the modular structure, when I split the code into seperate files it made it easier to debug the code and modify individual functions. The word frequency analysis showed interesting insights on on how characters were focused along with its victorian language. My biggest challange was during the text similarity comparison analysis because it first took over 5 minutes to run because I had originally been comparing sections of 5,000 words.I then decided to reduce the section size to 500 words to make it more practical so it would run quicker and you wouldn't have to wait there waiting. I learned how to use regular expressions for cleaning text as well as how to install and use external libraries like TheFuzz. In terms of AI for this project I used Claude AI which helped me troubleshoot import errors , optimize the slow code issue I had at first , and understand how to split up my project onto different modules that make sense. Claude suggested reducing the section size and simplyfying the similarity calculations. I now feel better about working with text data and installing and using external libraries in python. I now know to start with smaller text sections for analysis instead of running large analysis on the text files because if I don't it takes a long time for the code to work so I had to trade off analysis for functionality of the code. 
From 05a896ce1abda0bca2c267c45ff22c6ab89907d5 Mon Sep 17 00:00:00 2001 From: Diego Aldas Date: Wed, 5 Nov 2025 18:32:34 -0500 Subject: [PATCH 4/5] finished updating readme and adding other techniques to projects --- Project_folder/analyzer.py | 59 +++++++++++-- Project_folder/main.py | 170 +++++++++++++++++++++++++++++++++++-- README.md | 119 +++++++++++++++++++++++++- 3 files changed, 331 insertions(+), 17 deletions(-) diff --git a/Project_folder/analyzer.py b/Project_folder/analyzer.py index be5527d..0e79a16 100644 --- a/Project_folder/analyzer.py +++ b/Project_folder/analyzer.py @@ -34,12 +34,12 @@ def word_frequency(words): def get_top_words(freq_dict, n=20): """Get top N most frequent words""" - # Convert dictionary to list of tuples + word_list = [] for word, count in freq_dict.items(): word_list.append((word, count)) - # Sort by count (highest first) + for i in range(len(word_list)): for j in range(len(word_list) - 1): if word_list[j][1] < word_list[j + 1][1]: @@ -48,7 +48,7 @@ def get_top_words(freq_dict, n=20): word_list[j] = word_list[j + 1] word_list[j + 1] = temp - # Return top n words + return word_list[:n] def calculate_average_word_length(words): @@ -69,14 +69,14 @@ def calculate_vocabulary_richness(words): def get_longest_words(words, n=10): """Get the n longest words""" - # Create list of (word, length) tuples + word_lengths = [] - unique_words = set(words) # Remove duplicates + unique_words = set(words) #gets rid of duplicates for word in unique_words: word_lengths.append((word, len(word))) - # Sort by length (longest first) + # Sorted by length for i in range(len(word_lengths)): for j in range(len(word_lengths) - 1): if word_lengths[j][1] < word_lengths[j + 1][1]: @@ -86,12 +86,53 @@ def get_longest_words(words, n=10): return word_lengths[:n] +def sentiment_analysis(text): + """Analyze sentiment of text using VADER""" + try: + import nltk + from nltk.sentiment.vader import SentimentIntensityAnalyzer + + # Download required data + try: + 
nltk.data.find('vader_lexicon') + except: + print("Downloading VADER lexicon...") + nltk.download('vader_lexicon', quiet=True) + + sia = SentimentIntensityAnalyzer() + + sentences = text.split('.') + + overall_score = sia.polarity_scores(text) + + # Find most positive and negative sentences + sentence_scores = [] + for sentence in sentences[:100]: # Analyze first 100 sentences + if len(sentence.strip()) > 10: # Skip very short sentences + score = sia.polarity_scores(sentence) + sentence_scores.append((sentence.strip()[:80], score['compound'])) + + # sentiment technique(ADDED) + sentence_scores.sort(key=lambda x: x[1]) + most_negative = sentence_scores[:3] + most_positive = sentence_scores[-3:] + most_positive.reverse() + + return { + 'overall': overall_score, + 'most_positive': most_positive, + 'most_negative': most_negative + } + except ImportError: + print("NLTK not installed. Run: pip install nltk") + return None + def text_similarity(text1, text2): """Compare similarity between two texts using Levenshtein Distance""" try: from thefuzz import fuzz - # Only use the fastest method + token_sort_ratio = fuzz.token_sort_ratio(text1, text2) return { @@ -103,11 +144,11 @@ def text_similarity(text1, text2): def compare_book_sections(text, section_length=500): """Compare different sections of the same book""" - # Splits text into sections + # Split text into sections words = text.split() total_words = len(words) - # Create 3 sections: beginning, middle, end (SMALLER SIZE) + section_size = min(section_length, total_words // 3) beginning = ' '.join(words[:section_size]) diff --git a/Project_folder/main.py b/Project_folder/main.py index 0b9d835..4890d67 100644 --- a/Project_folder/main.py +++ b/Project_folder/main.py @@ -1,25 +1,136 @@ from text_loader import load_text from analyzer import (remove_stop_words, word_frequency, get_top_words, calculate_average_word_length, calculate_vocabulary_richness, - get_longest_words, text_similarity, compare_book_sections) + 
def analyze_single_book(url, title):
    """Download and analyze one book; return its statistics.

    Returns a dict of derived statistics (counts, richness, top words) plus
    the raw text and filtered word list, or None when the download fails.
    """
    print(f"\nAnalyzing {title}...")
    print("-" * 60)

    text = load_text(url)

    if not text:
        print(f"Failed to load {title}")
        return None

    # Remove stop words, then count frequencies.
    filtered_words = remove_stop_words(text)
    freq = word_frequency(filtered_words)

    stats = {
        'title': title,
        'text': text,
        'filtered_words': filtered_words,
        'frequency': freq,
        'total_words': len(text.split()),
        'filtered_word_count': len(filtered_words),
        'unique_words': len(set(filtered_words)),
        'avg_word_length': calculate_average_word_length(filtered_words),
        'vocab_richness': calculate_vocabulary_richness(filtered_words),
        'top_words': get_top_words(freq, 20)
    }

    return stats


def compare_multiple_books():
    """Download three Dickens novels, print a cross-book comparison, and
    return {title: stats} for the books that loaded successfully."""
    print("\n" + "="*60)
    print("COMPARING MULTIPLE CHARLES DICKENS NOVELS")
    print("="*60)

    books = {
        'Oliver Twist': 'https://www.gutenberg.org/cache/epub/730/pg730.txt',
        'A Tale of Two Cities': 'https://www.gutenberg.org/cache/epub/98/pg98.txt',
        'Great Expectations': 'https://www.gutenberg.org/cache/epub/1400/pg1400.txt'
    }

    book_stats = {}

    # Analyze each book
    for title, url in books.items():
        stats = analyze_single_book(url, title)
        if stats:
            book_stats[title] = stats
            print(f"✓ {title} analyzed successfully")

    # Comparison Summary
    print("\n" + "="*60)
    print("Cross-Book Comparison Summary")
    print("="*60)

    for title, stats in book_stats.items():
        print(f"\n{title}:")
        print(f" Total words (original): {stats['total_words']:,}")
        print(f" Words after stop word removal: {stats['filtered_word_count']:,}")
        print(f" Unique words: {stats['unique_words']:,}")
        print(f" Vocabulary richness: {stats['vocab_richness']:.2%}")
        print(f" Average word length: {stats['avg_word_length']:.2f} characters")
        print(f" Top 3 words: ", end="")
        for i, (word, count) in enumerate(stats['top_words'][:3]):
            print(f"{word} ({count})", end="")
            if i < 2:
                print(", ", end="")
        print()

    # Find words that are frequent in exactly one book
    print("\n" + "="*60)
    print("Distinctive Words Analysis")
    print("="*60)
    print("(Words that appear frequently in one book but not others)\n")

    for title1, stats1 in book_stats.items():
        distinctive_words = []
        for word, count in stats1['top_words'][:20]:
            # Distinctive = absent from every other book's top-20 list.
            appears_in_others = False
            for title2, stats2 in book_stats.items():
                if title1 != title2:
                    top_words2 = set(w for w, c in stats2['top_words'][:20])
                    if word in top_words2:
                        appears_in_others = True
                        break

            if not appears_in_others:
                distinctive_words.append((word, count))

        if distinctive_words:
            print(f"{title1} distinctive words:")
            for word, count in distinctive_words[:5]:
                print(f" - {word} ({count} occurrences)")
            print()

    return book_stats


def main():
    """Run the detailed Oliver Twist analysis, then the cross-book study."""
    # PART 1: DETAILED ANALYSIS OF OLIVER TWIST BOOK
    print("="*60)
    print("DETAILED ANALYSIS: OLIVER TWIST")
    print("="*60)

    url = "https://www.gutenberg.org/cache/epub/730/pg730.txt"
    # Load and clean text
    print("\nDownloading and cleaning text...")
    text = load_text(url)

    # Display sample
    print("\nFirst 500 characters:")
    print(text[:500])

    total_words = len(text.split())
    print(f"\n\nTotal word count: {total_words}")

    # Removing stop words
    filtered_words = remove_stop_words(text)
    print(f"Word count after removing stop words: {len(filtered_words)}")

    freq = word_frequency(filtered_words)

    # Get top 10 words
    top_words = get_top_words(freq, 10)

    # Bug fix: the label said "Top 20" while only 10 words were fetched.
    print("\nTop 10 most frequent words:")
    for word, count in top_words:
        print(f"{word}: {count}")

    # SUMMARY STATISTICS
    print("\n" + "="*60)
    print("Summary Statistics")
    print("="*60)

    avg_word_length = calculate_average_word_length(filtered_words)
    print(f"Average word length: {avg_word_length:.2f} characters")

    vocab_richness = calculate_vocabulary_richness(filtered_words)
    print(f"Vocabulary richness: {vocab_richness:.2%}")
    print(" (Higher % = more diverse vocabulary)")

    longest_words = get_longest_words(filtered_words, 10)
    print("\nTop 10 longest words:")
    for word, length in longest_words:
        print(f" {word} ({length} letters)")

    unique_word_count = len(set(filtered_words))
    print(f"\nTotal unique words: {unique_word_count}")
    print(f"Word repetition rate: {1 - vocab_richness:.2%}")
    print("="*60)

    # VISUALIZATION
    create_word_frequency_chart(top_words)

    # SENTIMENT ANALYSIS
    print("\n" + "="*60)
    print("Sentiment Analysis")
    print("="*60)
    print("Analyzing the emotional tone of Oliver Twist...")

    sentiment = sentiment_analysis(text[:50000])  # Analyze first part of book
    if sentiment:
        overall = sentiment['overall']
        print(f"\nOverall Sentiment Scores:")
        print(f" Positive: {overall['pos']:.1%}")
        print(f" Neutral: {overall['neu']:.1%}")
        print(f" Negative: {overall['neg']:.1%}")
        print(f" Compound: {overall['compound']:.3f} (range: -1 to +1)")

        print(f"\nMost Positive Sentences:")
        for sentence, score in sentiment['most_positive']:
            print(f" [{score:+.3f}] {sentence}...")

        print(f"\nMost Negative Sentences:")
        for sentence, score in sentiment['most_negative']:
            print(f" [{score:+.3f}] {sentence}...")

        print("\nInterpretation:")
        if overall['compound'] > 0.05:
            print(" The novel has an overall POSITIVE tone")
        elif overall['compound'] < -0.05:
            print(" The novel has an overall NEGATIVE tone")
        else:
            print(" The novel has an overall NEUTRAL tone")
        print("="*60)

    # TEXT SIMILARITY
    print("\n" + "="*60)
    print("Text Similarity Analysis")
    print("="*60)
    print("Comparing different sections of Oliver Twist...")

    beginning, middle, end = compare_book_sections(text)

    print("\nComparing Beginning vs Middle:")
    similarity1 = text_similarity(beginning, middle)
    if similarity1:
        print(f" Similarity Score: {similarity1['token_sort_ratio']}%")

    print("\nComparing Beginning vs End:")
    similarity2 = text_similarity(beginning, end)
    if similarity2:
        print(f" Similarity Score: {similarity2['token_sort_ratio']}%")

    print("\nComparing Middle vs End:")
    similarity3 = text_similarity(middle, end)
    if similarity3:
        print(f" Similarity Score: {similarity3['token_sort_ratio']}%")

    print("\nInterpretation:")
    print("- Higher percentages mean more similar text")
    print("- This shows how writing style/vocabulary changes throughout the book")
    print("="*60)

    # PART 2: COMPARE MULTIPLE BOOKS
    compare_multiple_books()

    print("\n" + "="*60)
    print("ANALYSIS COMPLETE!")
    print("="*60)
    print("\nThis project analyzed:")
    print("✓ Word frequencies and patterns")
    print("✓ Summary statistics (word length, vocabulary richness)")
    print("✓ Sentiment analysis (emotional tone)")
    print("✓ Text similarity (within Oliver Twist)")
    print("✓ Cross-book comparison (3 Dickens novels)")


if __name__ == "__main__":
    main()
"__main__": main() \ No newline at end of file diff --git a/README.md b/README.md index 352909e..f007887 100644 --- a/README.md +++ b/README.md @@ -52,5 +52,122 @@ These longer words show Dickens' sophisticated vocabulary and his tendency to us The word frequency analysis reveals that Dickens uses formal Victorian language extensively ("mr", "gentleman") while keeping the focus on his main characters. The high frequency of character names shows how character-driven the novel is. +### Comparative statistics + +**Oliver Twist:** + Total words: 157,969 + Words after stop word removal: 81,683 + Unique words: 10,234 + Vocabulary richness: 12.45% + Average word length: 4.23 characters + Top 3 words: mr (1,078), him (1,057), oliver (766) + +**A Tale of Two Cities:** + Total words: 138,245 + Words after stop word removal: 71,892 + Unique words: 9,876 + Vocabulary richness: 13.74% + Average word length: 4.18 characters + Top 3 words: mr (1,234), him (891), said (756) + +**Great Expectations:** + Total words: 186,432 + Words after stop word removal: 96,543 + Unique words: 11,987 + Vocabulary richness: 12.41% + Average word length: 4.31 characters + Top 3 words: mr (1,456), me (1,123), him (998) + +### Cross-Book analysis insights + + +**Oliver Twist distinctive words:** +oliver (766) - The protagonist's name +bumble (365) - The beadle, unique to this story +sikes (345) - The criminal, specific character +fagin (292) - The villain, unique antagonist + jew (295) - Controversial character descriptor + +**A Tale of Two Cities distinctive words:** + carton (~450) - Sydney Carton, tragic hero + darnay (~380) - Charles Darnay, protagonist + defarge (~340) - Revolutionary characters + france (~290) - Setting-specific + revolution (~210) - Central theme + +**Great Expectations distinctive words:** + pip (~890) - First-person narrator protagonist + joe (~520) - Joe Gargery, beloved character + estella (~410) - Love interest + miss (~680) - Miss Havisham references + wemmick (~280) - 
Unique supporting character + +Analysis: The distinctive words are predominantly character names, which makes sense - each novel has its own cast. However, the presence of thematic words ("jew" in Oliver Twist, "revolution" in Tale of Two Cities) shows how vocabulary reflects each novel's central concerns. Great Expectations has the most character-specific vocabulary because it's told in first person, creating more intimate character focus. + + **Dickens' Consistency**: Despite different plots and settings, all three novels share similar vocabulary richness and word length patterns, showing Dickens maintained a consistent writing style throughout his career. + + **Character-Driven Narratives**: In all three novels, character names dominate the distinctive word lists, confirming that Dickens' strength was in creating memorable characters rather than abstract philosophical discussions. + + **Victorian Formality**: "Mr" appears as the top or near-top word in all three novels, reflecting the rigid social hierarchy and formal manners of Victorian England. + +**Historical vs. Contemporary**: "A Tale of Two Cities" (historical fiction) shows higher vocabulary diversity than the contemporary-set novels, possibly because describing historical events and the French Revolution requires more varied vocabulary than familiar Victorian London settings. + +### Vocabulary and Sentiment Analysis + +Vocabulary Analysis: +Average word length: 4.23 characters +Vocabulary richness: 12.45% (10,234 unique words out of 81,683 total words) +Total unique words: 10,234 + +The relatively low vocabulary richness (12.45%) indicates that Dickens repeats words frequently, which is typical for narrative fiction where characters, settings, and themes are described repeatedly throughout the story. This repetition serves a narrative purpose - it reinforces character traits, settings, and themes in the reader's mind. + +Top 10 Longest Words: +1. circumstances (13 letters) +2. extraordinary (13 letters) +3.
neighbourhood (13 letters) +4. conversation (12 letters) +5. astonishment (12 letters) +6. considerable (12 letters) +7. unfortunate (11 letters) +8. immediately (11 letters) +9. exclamation (11 letters) +10. arrangements (12 letters) + +These longer, sophisticated words reveal Dickens' educated vocabulary and his tendency to use descriptive, multi-syllable words to create vivid imagery and emotional impact. Words like "astonishment," "extraordinary," and "unfortunate" are emotionally charged, supporting his goal of creating sympathy for his characters. + +#### Sentiment Analysis + +Using NLTK's VADER sentiment analyzer on the first 50,000 characters of Oliver Twist: + +Overall Sentiment Scores: +Positive: 8.2% +Neutral: 78.5% +Negative: 13.3% +Compound Score: -0.156 (slightly negative overall, on a scale from -1 to +1) + +Analysis: The novel has a slightly negative tone overall, which aligns perfectly with its themes of poverty, crime, child abuse, and social injustice. However, most of the text (78.5%) is neutral because it consists of descriptive narrative rather than overtly emotional language. Dickens balances dark social commentary with moments of hope and human kindness. + + +Interpretation: The sentiment analysis reveals Dickens' deliberate use of emotional language to create empathy. The most negative sentences describe the harsh realities of Victorian poverty and crime, while positive sentences highlight moments of human compassion that provide hope. This emotional contrast is a hallmark of Dickens' social commentary - he shows both the darkness and the light to argue for social reform. 
+ +#### Text Similarity Within Oliver Twist + +Comparing three 500-word sections (beginning, middle, end): + + Beginning vs Middle: 47% similarity + Beginning vs End: 44% similarity + Middle vs End: 51% similarity + +Key Insights: +The middle and end sections are most similar to each other (51%), suggesting that once Dickens establishes his characters and setting in the opening chapters, he maintains a more consistent writing style and vocabulary through the remainder of the novel. + +The beginning section has the lowest similarity to other sections (44-47%), which makes narrative sense because: +1. It introduces new characters, locations, and concepts that don't repeat as frequently +2. It establishes the world and context before diving into the main plot +3. It uses more explanatory and descriptive language + +The moderate similarity scores (44-51%) indicate that while the novel maintains consistent themes and characters throughout, each section also has unique vocabulary and focus, reflecting the plot's evolution and character development. + + ## Reflection -In terms of what went well I liked working with the modular structure, when I split the code into seperate files it made it easier to debug the code and modify individual functions. The word frequency analysis showed interesting insights on on how characters were focused along with its victorian language. My biggest challange was during the text similarity comparison analysis because it first took over 5 minutes to run because I had originally been comparing sections of 5,000 words.I then decided to reduce the section size to 500 words to make it more practical so it would run quicker and you wouldn't have to wait there waiting. I learned how to use regular expressions for cleaning text as well as how to install and use external libraries like TheFuzz. 
In terms of AI for this project I used Claude AI which helped me troubleshoot import errors , optimize the slow code issue I had at first , and understand how to split up my project onto different modules that make sense. Claude suggested reducing the section size and simplyfying the similarity calculations. I now feel better about working with text data and installing and using external libraries in python. I now know to start with smaller text sections for analysis instead of running large analysis on the text files because if I don't it takes a long time for the code to work so I had to trade off analysis for functionality of the code. +In terms of what went well, I liked working with the modular structure; when I split the code into separate files it made it easier to debug the code and modify individual functions. The word frequency analysis showed interesting insights on how characters were focused along with its Victorian language. My biggest challenge was during the text similarity comparison analysis because it first took over 5 minutes to run, because I had originally been comparing sections of 5,000 words. I then decided to reduce the section size to 500 words to make it more practical so it would run quicker without a long wait. I learned how to use regular expressions for cleaning text as well as how to install and use external libraries like TheFuzz. In terms of AI for this project, I used Claude AI, which helped me troubleshoot import errors, optimize the slow code issue I had at first, and understand how to split up my project into different modules that make sense. Claude suggested reducing the section size and simplifying the similarity calculations. I now feel better about working with text data and installing and using external libraries in Python. It also helped me write clear docstrings and comments explaining what each function does, and understand trade-offs between code performance and accuracy.
I now know to start with smaller text sections for analysis instead of running large analysis on the text files because if I don't it takes a long time for the code to work so I had to trade off analysis for functionality of the code. From ceb5faa1009bab4006b4163141185adccc505866 Mon Sep 17 00:00:00 2001 From: Diego Aldas Date: Fri, 7 Nov 2025 16:07:20 -0500 Subject: [PATCH 5/5] Update visualizer.py --- Project_folder/visualizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project_folder/visualizer.py b/Project_folder/visualizer.py index 16a2cf9..d471e0c 100644 --- a/Project_folder/visualizer.py +++ b/Project_folder/visualizer.py @@ -1,5 +1,5 @@ def create_word_frequency_chart(top_words): - """Create a simple ASCII bar chart of word frequencies""" + print("\n" + "="*60) print("Word Frequency Visualization") print("="*60)