# text_processing.py
# yfinance API (to obtain live market data)
import os
from collections import Counter
# def api_access(api_key: str):
#     url = "https://newsapi.org/v2/everything?q=NFL&sortBy=popularity&apiKey=fd772e4022f1491994dc98392a019790"
#     response = requests.get(url)
#     print(response)
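# A minimal runnable sketch of the commented-out NewsAPI call above. The
# endpoint and query parameters come from that comment; the lazy import,
# timeout, and JSON handling are assumptions, not tested against the live API.
def api_access(api_key: str) -> dict:
    import requests  # imported lazily so the module loads without requests installed

    url = f"https://newsapi.org/v2/everything?q=NFL&sortBy=popularity&apiKey={api_key}"
    response = requests.get(url, timeout=10)
    response.raise_for_status()  # surface HTTP errors instead of printing the raw response
    return response.json()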
def read_gutenberg(path: str, skip_header: bool = True,
                   encoding_primary: str = "utf-8",
                   encoding_fallback: str = "latin-1") -> str:
    """
    Read a Project Gutenberg .txt file, optionally skipping the boilerplate
    header and footer. Returns the file's main content as a single string.
    """
    def read_with_encoding(enc: str) -> str:
        lines = []
        with open(path, "r", encoding=enc, errors="strict") as f:
            # --- Skip the header (until we hit the START marker) ---
            if skip_header:
                for line in f:
                    up = line.upper()
                    if up.startswith("*** START OF") or "START OF THE PROJECT" in up:
                        break
            # --- Read main content; stop at the END marker if skipping boilerplate ---
            for line in f:
                up = line.upper()
                if skip_header and (up.startswith("*** END OF") or "END OF THE PROJECT" in up):
                    break
                lines.append(line)
        return "".join(lines)

    # Try the primary encoding, then fall back if decoding fails
    try:
        return read_with_encoding(encoding_primary)
    except UnicodeDecodeError:
        return read_with_encoding(encoding_fallback)
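# Example usage (the path below is hypothetical; any local Gutenberg .txt works):
#   text = read_gutenberg("text/frankenstein.txt")
#   print(text[:200])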
def load_txt(folder: str, skip_header: bool = True):
    """
    Load all .txt files in a folder.
    Returns: list of (filename, text) with Gutenberg boilerplate removed if requested.
    """
    out = []
    if not os.path.isdir(folder):
        print(f"[!] Folder '{folder}' not found.")
        return out
    for fn in os.listdir(folder):
        if fn.lower().endswith(".txt"):
            path = os.path.join(folder, fn)
            try:
                text = read_gutenberg(path, skip_header=skip_header,
                                      encoding_primary="utf-8", encoding_fallback="latin-1")
                out.append((fn, text))
            except Exception as e:
                print(f"[!] Could not read '{fn}': {e}")
    print(f"You have loaded {len(out)} text files into the processor")
    return out
def clean_and_filter(text: str) -> str:
    """Lowercase, keep only letters, and remove stopwords/pronouns."""
    filter_words = {
        # --- Stopwords ---
        'the', 'and', 'a', 'to', 'of', 'in', 'is', 'it', 'for', 'on', 'was', 'as', 'at',
        'with', 'be', 'by', 'an', 'are', 'were', 'from', 'this', 'which',
        # --- Pronouns ---
        'i', 'you', 'he', 'she', 'we', 'they', 'me', 'him', 'her', 'us', 'them',
        'my', 'your', 'his', 'its', 'our', 'their', 'mine', 'yours', 'hers', 'ours', 'theirs',
        'myself', 'yourself', 'himself', 'herself', 'itself', 'ourselves', 'yourselves', 'themselves',
        'that', 'these', 'those',
        'who', 'whom', 'whose', 'what',
        'all', 'another', 'any', 'anybody', 'anyone', 'anything', 'both', 'each', 'either',
        'everybody', 'everyone', 'everything', 'few', 'many', 'most', 'neither', 'nobody',
        'none', 'no', 'one', 'other', 'others', 'several', 'some', 'somebody',
        'someone', 'something', 'such',
    }
    text = text.lower()
    # Keep only letters and whitespace
    cleaned = ""
    for ch in text:
        if ch.isalpha() or ch.isspace():
            cleaned += ch
    # Drop stopwords and pronouns
    filtered = [w for w in cleaned.split() if w not in filter_words]
    return " ".join(filtered)
def count_words(text: str) -> dict:
    """Return a {word: frequency} dict for whitespace-separated words."""
    counts = {}
    for word in text.split():
        counts[word] = counts.get(word, 0) + 1
    return counts
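# Equivalent one-liner using the Counter imported above (Counter is a dict
# subclass, so the result is interchangeable with count_words' return value):
#   counts = Counter(text.split())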
def summary_stats(text: str) -> dict:
    """Compute word count, unique-word count, average and longest word length."""
    words = text.split()
    unique = set(words)
    total = 0
    for w in words:
        total += len(w)
    longest_len = max((len(w) for w in words), default=0)
    avg_word_len = total / max(len(words), 1)  # guard against empty text
    print("This is an overview of the files!")
    return {
        "words": len(words),
        "unique_words": len(unique),
        "avg_word_len": avg_word_len,
        "longest_word_len": longest_len,
    }
def main():
    folder = "text"
    files = load_txt(folder)
    if not files:
        return

    all_counts = {}
    for name, text in files:
        cleaned = clean_and_filter(text)
        counts = count_words(cleaned)

        print(f"\nTop words in {name}:")
        top = sorted(counts.items(), key=lambda x: x[1], reverse=True)[:10]
        for w, c in top:
            print(f"  {w:<12} {c}")

        stats = summary_stats(cleaned)
        print(f"  words={stats['words']}  unique={stats['unique_words']}  "
              f"avg word length={stats['avg_word_len']:.3f}  "
              f"longest word length={stats['longest_word_len']}")

        # Combine into overall counts
        for w, c in counts.items():
            all_counts[w] = all_counts.get(w, 0) + c

    print("\nOverall most common words:")
    top_all = sorted(all_counts.items(), key=lambda x: x[1], reverse=True)[:10]
    for w, c in top_all:
        print(f"  {w:<12} {c}")


if __name__ == "__main__":
    main()