-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathLoadPreProcess.py
More file actions
102 lines (80 loc) · 3.04 KB
/
LoadPreProcess.py
File metadata and controls
102 lines (80 loc) · 3.04 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
# legislation_topics.py
import os
import re
import string
from functools import lru_cache

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from gensim.corpora import Dictionary
from gensim.models.coherencemodel import CoherenceModel
# -----------------------------
# STEP 1: Preprocessing
# -----------------------------
def load_legislation_texts(folder_path):
    """Read every ``.txt`` file in *folder_path* and return their contents.

    Filenames are sorted before reading so the returned list has a
    deterministic order; ``os.listdir`` order is filesystem-dependent,
    which would otherwise make downstream embeddings/topics irreproducible
    across runs and machines.

    Args:
        folder_path: Directory to scan (non-recursive).

    Returns:
        list[str]: File contents, one string per ``.txt`` file, in
        sorted-filename order.
    """
    texts = []
    for file in sorted(os.listdir(folder_path)):
        if file.endswith(".txt"):
            with open(os.path.join(folder_path, file), 'r', encoding='utf-8') as f:
                texts.append(f.read())
    return texts
@lru_cache(maxsize=1)
def _english_stopwords():
    """Load the English stop-word set once.

    ``stopwords.words("english")`` reads the corpus from disk and returns a
    list; caching a frozenset avoids repeating that work for every document
    and gives O(1) membership tests.
    """
    return frozenset(stopwords.words("english"))


def preprocess_text(text):
    """Normalize one document for topic modeling.

    Lowercases, collapses whitespace, strips citation markers like ``[12]``,
    tokenizes with NLTK, and drops non-alphanumeric tokens and English
    stop words.

    Args:
        text: Raw document text.

    Returns:
        str: Space-joined cleaned tokens.
    """
    text = text.lower()
    text = re.sub(r'\s+', ' ', text)      # collapse runs of whitespace
    text = re.sub(r'\[\d+\]', '', text)   # remove bracketed citation markers
    stop_words = _english_stopwords()
    tokens = [w for w in word_tokenize(text)
              if w.isalnum() and w not in stop_words]
    return ' '.join(tokens)
# -----------------------------
# STEP 2: Embeddings (LegalBERT)
# -----------------------------
def get_embeddings(texts):
    """Encode documents into dense vectors using a LegalBERT sentence model.

    Args:
        texts: Sequence of (preprocessed) document strings.

    Returns:
        Embedding matrix as produced by ``SentenceTransformer.encode``,
        one row per input document.
    """
    print("Loading LegalBERT...")
    encoder = SentenceTransformer("nlpaueb/legal-bert-base-uncased")
    print("Generating embeddings...")
    embeddings = encoder.encode(texts, show_progress_bar=True)
    return embeddings
# -----------------------------
# STEP 3: Topic Modeling
# -----------------------------
def run_topic_modeling(docs, embeddings):
    """Fit a BERTopic model on documents with pre-computed embeddings.

    Args:
        docs: Preprocessed document strings.
        embeddings: Embedding matrix aligned with *docs*.

    Returns:
        tuple: ``(fitted_model, topic_ids, probabilities)`` where
        ``topic_ids`` and ``probabilities`` are per-document outputs of
        ``BERTopic.fit_transform``.
    """
    print("Running BERTopic...")
    model = BERTopic(verbose=True)
    assigned_topics, assigned_probs = model.fit_transform(docs, embeddings)
    return model, assigned_topics, assigned_probs
# -----------------------------
# STEP 4: Coherence Score
# -----------------------------
def compute_coherence(topic_model, docs):
    """Compute the C_v coherence of a fitted BERTopic model over *docs*.

    Fixes two issues in the original: BERTopic's outlier topic ``-1`` (a
    catch-all bucket of unassigned documents) is now excluded, since its
    word list is not a real topic and distorts the score; and the unused
    ``doc2bow`` corpus construction is dropped (C_v uses ``texts`` +
    ``dictionary``, not a bag-of-words corpus).

    Args:
        topic_model: A fitted BERTopic instance.
        docs: The (preprocessed, whitespace-tokenizable) documents the
            model was fitted on.

    Returns:
        float: The C_v coherence score.
    """
    print("Calculating topic coherence...")
    # Skip topic -1: BERTopic reserves it for outlier/unassigned documents.
    topic_ids = [t for t in topic_model.get_topics() if t != -1]
    topic_words = [[word for word, _ in topic_model.get_topic(t)] for t in topic_ids]
    tokenized_docs = [doc.split() for doc in docs]
    dictionary = Dictionary(tokenized_docs)
    coherence_model = CoherenceModel(
        topics=topic_words,
        texts=tokenized_docs,
        dictionary=dictionary,
        coherence='c_v'
    )
    return coherence_model.get_coherence()
# -----------------------------
# MAIN SCRIPT
# -----------------------------
def main():
    """Run the full pipeline: load -> preprocess -> embed -> model -> score."""
    # Tokenizer models and stop-word lists required by the preprocessing step.
    for resource in ("punkt", "stopwords"):
        nltk.download(resource)

    folder_path = "."  # assumes .txt files are in current dir
    raw_texts = load_legislation_texts(folder_path)
    print(f"Loaded {len(raw_texts)} documents.")

    preprocessed_texts = [preprocess_text(t) for t in raw_texts]
    embeddings = get_embeddings(preprocessed_texts)
    topic_model, topics, probs = run_topic_modeling(preprocessed_texts, embeddings)

    coherence = compute_coherence(topic_model, preprocessed_texts)
    print(f"🧠 Topic Coherence Score (C_v): {coherence:.4f}")

    print("📊 Launching interactive topic visualization...")
    topic_model.visualize_topics().show()


if __name__ == "__main__":
    main()