-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathLoadPreProcess.py
More file actions
102 lines (80 loc) · 3.04 KB
/
LoadPreProcess.py
File metadata and controls
102 lines (80 loc) · 3.04 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
# legislation_topics.py
import os
import re
import string
from functools import lru_cache

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from gensim.corpora import Dictionary
from gensim.models.coherencemodel import CoherenceModel
# -----------------------------
# STEP 1: Preprocessing
# -----------------------------
def load_legislation_texts(folder_path):
    """Read every ``.txt`` file in *folder_path* and return their contents.

    Filenames are sorted before reading so the returned list has a
    deterministic order; ``os.listdir`` order is filesystem-dependent,
    which would otherwise make downstream embeddings/topics irreproducible
    across runs and machines.

    Args:
        folder_path: Directory to scan (non-recursive).

    Returns:
        list[str]: File contents, one string per ``.txt`` file, in
        sorted-filename order.
    """
    texts = []
    for file in sorted(os.listdir(folder_path)):
        if file.endswith(".txt"):
            with open(os.path.join(folder_path, file), 'r', encoding='utf-8') as f:
                texts.append(f.read())
    return texts
@lru_cache(maxsize=1)
def _english_stopwords():
    """Load the English stop-word set once.

    ``stopwords.words("english")`` reads the corpus from disk and returns a
    list; caching a frozenset avoids repeating that work for every document
    and gives O(1) membership tests.
    """
    return frozenset(stopwords.words("english"))


def preprocess_text(text):
    """Normalize one document for topic modeling.

    Lowercases, collapses whitespace, strips citation markers like ``[12]``,
    tokenizes with NLTK, and drops non-alphanumeric tokens and English
    stop words.

    Args:
        text: Raw document text.

    Returns:
        str: Space-joined cleaned tokens.
    """
    text = text.lower()
    text = re.sub(r'\s+', ' ', text)      # collapse runs of whitespace
    text = re.sub(r'\[\d+\]', '', text)   # remove bracketed citation markers
    stop_words = _english_stopwords()
    tokens = [w for w in word_tokenize(text)
              if w.isalnum() and w not in stop_words]
    return ' '.join(tokens)
# -----------------------------
# STEP 2: Embeddings (LegalBERT)
# -----------------------------
def get_embeddings(texts):
    """Encode documents into dense vectors using a LegalBERT sentence model.

    Args:
        texts: Sequence of (preprocessed) document strings.

    Returns:
        Embedding matrix as produced by ``SentenceTransformer.encode``,
        one row per input document.
    """
    print("Loading LegalBERT...")
    encoder = SentenceTransformer("nlpaueb/legal-bert-base-uncased")
    print("Generating embeddings...")
    embeddings = encoder.encode(texts, show_progress_bar=True)
    return embeddings
# -----------------------------
# STEP 3: Topic Modeling
# -----------------------------
def run_topic_modeling(docs, embeddings):
    """Fit a BERTopic model on documents with pre-computed embeddings.

    Args:
        docs: Preprocessed document strings.
        embeddings: Embedding matrix aligned with *docs*.

    Returns:
        tuple: ``(fitted_model, topic_ids, probabilities)`` where
        ``topic_ids`` and ``probabilities`` are per-document outputs of
        ``BERTopic.fit_transform``.
    """
    print("Running BERTopic...")
    model = BERTopic(verbose=True)
    assigned_topics, assigned_probs = model.fit_transform(docs, embeddings)
    return model, assigned_topics, assigned_probs
# -----------------------------
# STEP 4: Coherence Score
# -----------------------------
def compute_coherence(topic_model, docs):
    """Compute the C_v coherence of a fitted BERTopic model over *docs*.

    Fixes two issues in the original: BERTopic's outlier topic ``-1`` (a
    catch-all bucket of unassigned documents) is now excluded, since its
    word list is not a real topic and distorts the score; and the unused
    ``doc2bow`` corpus construction is dropped (C_v uses ``texts`` +
    ``dictionary``, not a bag-of-words corpus).

    Args:
        topic_model: A fitted BERTopic instance.
        docs: The (preprocessed, whitespace-tokenizable) documents the
            model was fitted on.

    Returns:
        float: The C_v coherence score.
    """
    print("Calculating topic coherence...")
    # Skip topic -1: BERTopic reserves it for outlier/unassigned documents.
    topic_ids = [t for t in topic_model.get_topics() if t != -1]
    topic_words = [[word for word, _ in topic_model.get_topic(t)] for t in topic_ids]
    tokenized_docs = [doc.split() for doc in docs]
    dictionary = Dictionary(tokenized_docs)
    coherence_model = CoherenceModel(
        topics=topic_words,
        texts=tokenized_docs,
        dictionary=dictionary,
        coherence='c_v'
    )
    return coherence_model.get_coherence()
# -----------------------------
# MAIN SCRIPT
# -----------------------------
def main():
    """Run the full pipeline: load -> preprocess -> embed -> model -> score."""
    # Tokenizer models and stop-word lists required by the preprocessing step.
    for resource in ("punkt", "stopwords"):
        nltk.download(resource)

    folder_path = "."  # assumes .txt files are in current dir
    raw_texts = load_legislation_texts(folder_path)
    print(f"Loaded {len(raw_texts)} documents.")

    preprocessed_texts = [preprocess_text(t) for t in raw_texts]
    embeddings = get_embeddings(preprocessed_texts)
    topic_model, topics, probs = run_topic_modeling(preprocessed_texts, embeddings)

    coherence = compute_coherence(topic_model, preprocessed_texts)
    print(f"🧠 Topic Coherence Score (C_v): {coherence:.4f}")

    print("📊 Launching interactive topic visualization...")
    topic_model.visualize_topics().show()


if __name__ == "__main__":
    main()