ChatSCE/class_scraper_util.py at dev · Yeezusson21/ChatSCE · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
from bs4 import BeautifulSoup
import re
import requests
import json
import unicodedata
from urllib.parse import urlparse, parse_qs
from collections import defaultdict

# Catalog listing URLs to scrape go here.
# Use SJSU course catalog (catalog.sjsu.edu) filter URLs; otherwise the scraper won't work.
# The commented-out list below covers CMPE, CS, SE, EE, MATH, ISE, and ENGR.
# url_list = [
#     "https://catalog.sjsu.edu/content.php?filter%5B27%5D=CMPE&filter%5B29%5D=&filter%5Bkeyword%5D=&filter%5B32%5D=1&filter%5Bcpage%5D=1&cur_cat_oid=14&expand=&navoid=5106&search_database=Filter&filter%5Bexact_match%5D=1#acalog_template_course_filter",
#     "https://catalog.sjsu.edu/content.php?catoid=14&navoid=5106&filter%5B27%5D=CMPE&filter%5B29%5D=&filter%5Bkeyword%5D=&filter%5B32%5D=1&filter%5Bcpage%5D=2&filter%5Bexact_match%5D=1&filter%5Bitem_type%5D=3&filter%5Bonly_active%5D=1&filter%5B3%5D=1#acalog_template_course_filter",
#     "https://catalog.sjsu.edu/content.php?filter%5B27%5D=SE&filter%5B29%5D=&filter%5Bkeyword%5D=&filter%5B32%5D=1&filter%5Bcpage%5D=1&cur_cat_oid=14&expand=&navoid=5106&search_database=Filter&filter%5Bexact_match%5D=1#acalog_template_course_filter",
#     "https://catalog.sjsu.edu/content.php?filter%5B27%5D=CS&filter%5B29%5D=&filter%5Bkeyword%5D=&filter%5B32%5D=1&filter%5Bcpage%5D=1&cur_cat_oid=14&expand=&navoid=5106&search_database=Filter&filter%5Bexact_match%5D=1#acalog_template_course_filter",
#     "https://catalog.sjsu.edu/content.php?filter%5B27%5D=EE&filter%5B29%5D=&filter%5Bkeyword%5D=&filter%5B32%5D=1&filter%5Bcpage%5D=1&cur_cat_oid=14&expand=&navoid=5106&search_database=Filter&filter%5Bexact_match%5D=1#acalog_template_course_filter",
#     "https://catalog.sjsu.edu/content.php?catoid=14&navoid=5106&filter%5B27%5D=EE&filter%5B29%5D=&filter%5Bkeyword%5D=&filter%5B32%5D=1&filter%5Bcpage%5D=2&filter%5Bexact_match%5D=1&filter%5Bitem_type%5D=3&filter%5Bonly_active%5D=1&filter%5B3%5D=1#acalog_template_course_filter",
#     "https://catalog.sjsu.edu/content.php?filter%5B27%5D=MATH&filter%5B29%5D=&filter%5Bkeyword%5D=&filter%5B32%5D=1&filter%5Bcpage%5D=1&cur_cat_oid=14&expand=&navoid=5106&search_database=Filter&filter%5Bexact_match%5D=1#acalog_template_course_filter",
#     "https://catalog.sjsu.edu/content.php?catoid=14&navoid=5106&filter%5B27%5D=MATH&filter%5B29%5D=&filter%5Bkeyword%5D=&filter%5B32%5D=1&filter%5Bcpage%5D=2&filter%5Bexact_match%5D=1&filter%5Bitem_type%5D=3&filter%5Bonly_active%5D=1&filter%5B3%5D=1#acalog_template_course_filter",
#     "https://catalog.sjsu.edu/content.php?filter%5B27%5D=ISE&filter%5B29%5D=&filter%5Bkeyword%5D=&filter%5B32%5D=1&filter%5Bcpage%5D=1&cur_cat_oid=14&expand=&navoid=5106&search_database=Filter&filter%5Bexact_match%5D=1#acalog_template_course_filter",
#     "https://catalog.sjsu.edu/content.php?filter%5B27%5D=ENGR&filter%5B29%5D=&filter%5Bkeyword%5D=&filter%5B32%5D=1&filter%5Bcpage%5D=1&cur_cat_oid=14&expand=&navoid=5106&search_database=Filter&filter%5Bexact_match%5D=1#acalog_template_course_filter",
# ]

# Active URL list: catalog filter pages for BIOL (result pages 1 and 2), CHEM, PHYS, BME, DATA, STAT, and TECH.
# The department code is the value of each URL's filter[27] query parameter (URL-encoded as filter%5B27%5D);
# Scraper.filter_hrefs reads that parameter to decide which course links to keep.
# This list can be combined with the commented-out list above, but that results in a significantly longer
# runtime because of how many courses there are.
# The script always writes to output.json, so if you have more departments to add it is better to rename
# the existing output file and run the scraper again with a different list.
url_list = [
    "https://catalog.sjsu.edu/content.php?filter%5B27%5D=BIOL&filter%5B29%5D=&filter%5Bkeyword%5D=&filter%5B32%5D=1&filter%5Bcpage%5D=1&cur_cat_oid=14&expand=&navoid=5106&search_database=Filter&filter%5Bexact_match%5D=1#acalog_template_course_filter",
    "https://catalog.sjsu.edu/content.php?catoid=14&navoid=5106&filter%5B27%5D=BIOL&filter%5B29%5D=&filter%5Bkeyword%5D=&filter%5B32%5D=1&filter%5Bcpage%5D=2&filter%5Bexact_match%5D=1&filter%5Bitem_type%5D=3&filter%5Bonly_active%5D=1&filter%5B3%5D=1#acalog_template_course_filter",
    "https://catalog.sjsu.edu/content.php?filter%5B27%5D=CHEM&filter%5B29%5D=&filter%5Bkeyword%5D=&filter%5B32%5D=1&filter%5Bcpage%5D=1&cur_cat_oid=14&expand=&navoid=5106&search_database=Filter&filter%5Bexact_match%5D=1#acalog_template_course_filter",
    "https://catalog.sjsu.edu/content.php?filter%5B27%5D=PHYS&filter%5B29%5D=&filter%5Bkeyword%5D=&filter%5B32%5D=1&filter%5Bcpage%5D=1&cur_cat_oid=14&expand=&navoid=5106&search_database=Filter&filter%5Bexact_match%5D=1#acalog_template_course_filter",
    "https://catalog.sjsu.edu/content.php?filter%5B27%5D=BME&filter%5B29%5D=&filter%5Bkeyword%5D=&filter%5B32%5D=1&filter%5Bcpage%5D=1&cur_cat_oid=14&expand=&navoid=5106&search_database=Filter&filter%5Bexact_match%5D=1#acalog_template_course_filter",
    "https://catalog.sjsu.edu/content.php?filter%5B27%5D=DATA&filter%5B29%5D=&filter%5Bkeyword%5D=&filter%5B32%5D=1&filter%5Bcpage%5D=1&cur_cat_oid=14&expand=&navoid=5106&search_database=Filter&filter%5Bexact_match%5D=1#acalog_template_course_filter",
    "https://catalog.sjsu.edu/content.php?filter%5B27%5D=STAT&filter%5B29%5D=&filter%5Bkeyword%5D=&filter%5B32%5D=1&filter%5Bcpage%5D=1&cur_cat_oid=14&expand=&navoid=5106&search_database=Filter&filter%5Bexact_match%5D=1#acalog_template_course_filter",
    "https://catalog.sjsu.edu/content.php?filter%5B27%5D=TECH&filter%5B29%5D=&filter%5Bkeyword%5D=&filter%5B32%5D=1&filter%5Bcpage%5D=1&cur_cat_oid=14&expand=&navoid=5106&search_database=Filter&filter%5Bexact_match%5D=1#acalog_template_course_filter",
]

# TODO Implement topological sort to determine prerequisites and corequisites
class ClassGraph:
    """Directed graph of course nodes with a DFS-based topological sort.

    A node is any object exposing a ``title`` attribute (for example a
    ``ClassNode``); the title is the node's identity inside the graph.
    Cycles are not detected: a cyclic graph still terminates, but the
    returned order is then not a valid topological order.
    """

    def __init__(self):
        # title -> list of node objects reached by one outgoing edge
        self.graph = defaultdict(list)
        # title -> node object, for every node seen at either end of an edge
        self.nodes = {}

    def add_edge(self, u, v):
        """Add a directed edge from node ``u`` to node ``v``.

        Both endpoints are registered so that nodes which only ever appear
        as an edge target still take part in ``topsort``.
        """
        self.nodes.setdefault(u.title, u)
        self.nodes.setdefault(v.title, v)
        self.graph[u.title].append(v)

    # DFS search
    def top_sort_helper(self, node, visited, stack):
        """Visit ``node`` depth-first and append it to ``stack`` in post-order.

        Args:
            node: Node to start from; must expose ``title``.
            visited: Dict mapping title -> bool, updated in place.
            stack: List receiving nodes once all their successors are done.
        """
        visited[node.title] = True

        # .get() instead of [] so the defaultdict is never grown while
        # topsort() is iterating; missing titles simply have no successors.
        for neighbor in self.graph.get(node.title, ()):
            if not visited.get(neighbor.title, False):
                self.top_sort_helper(neighbor, visited, stack)
        stack.append(node)

    def topsort(self):
        """Return all registered nodes so that each edge ``u -> v`` has ``u`` before ``v``.

        Returns:
            A list of node objects; empty when no edge has been added.
        """
        visited = {title: False for title in self.nodes}
        stack = []
        for title, node in self.nodes.items():
            if not visited[title]:
                self.top_sort_helper(node, visited, stack)
        # Post-order reversed gives the topological order.
        return stack[::-1]

# TODO Implement a way to convert JSON to ClassNode objects
class ClassNode:
    """One catalog course together with its scraped attributes.

    Attributes:
        title: Course name.
        units: Unit text for the course.
        description: Course description text.
        prereqs: Prerequisites, either raw text or a list of course codes.
        coreqs: Corequisites, either raw text or a list of course codes.
        grading_type: Grading text for the course.
        note: Additional notes text.
    """

    # A course code such as "MATH 32X", optionally followed by
    # "or <course code>" alternatives ("MATH 32 or MATH 32H").
    _COURSE_PATTERN = re.compile(
        r'\b[A-Z]+\s+\d+[A-Z]*\b(?:\s+or\s+[A-Z]+\s+\d+[A-Z]*)*'
    )
    # Text that begins with the standalone word "No" ("No prerequisites").
    # The word boundary keeps "Not ...", "Note ..." etc. from matching.
    _NO_REQS_PATTERN = re.compile(r'\s*No\b')
    # The "or" separator inside a match, with whatever whitespace surrounds it.
    _OR_SEPARATOR = re.compile(r'\s+or\s+')

    def __init__(
        self, course_name, units, description, prereqs, coreqs, grading_type, note
    ):
        self.title = course_name
        self.units = units
        self.description = description
        self.prereqs = prereqs
        self.coreqs = coreqs
        self.grading_type = grading_type
        self.note = note

    #Case 1: general class format "Math 30X, Math30" or No reqs needed
    #Case 2: reqs that change depending on your major --> "CMPE major: CMPE 126; SE major: CS 46B other majors: CMPE 30 , MATH 32 or MATH 32H or MATH 32X."
    #Case 3: everything else (loosely related reqs) --> "Upper division standing or instructor consent"
    @staticmethod
    def extract_course_titles(reqs_string):
        """Pull course codes out of a prerequisite/corequisite description.

        Args:
            reqs_string: Requirement text. Non-string values (for example an
                already extracted list) are returned unchanged.

        Returns:
            A list of course codes, where alternatives joined by "or" are
            merged into one entry separated by "/" ("MATH 32/MATH 32H").
            The input itself is returned when it starts with the word "No"
            or contains no course code at all (case 3).
        """
        if not isinstance(reqs_string, str):
            return reqs_string

        # Only a leading standalone "No" means "no requirements"; a plain
        # substring test would also swallow "Not open to ..." and similar.
        if ClassNode._NO_REQS_PATTERN.match(reqs_string):
            return reqs_string

        matches = ClassNode._COURSE_PATTERN.findall(reqs_string)
        if not matches:  # if the pattern falls within case 3
            return reqs_string

        return [ClassNode._OR_SEPARATOR.sub("/", match) for match in matches]

    # TODO account for Case 2

    @classmethod
    def from_json(cls, json_data):
        """Build a node from a dict with the keys ``title``, ``units``,
        ``description``, ``prereqs``, ``coreqs``, ``grading_type`` and ``note``.

        Raises:
            KeyError: If any of those keys is missing.
        """
        return cls(
            json_data['title'],
            json_data['units'],
            json_data['description'],
            cls.extract_course_titles(json_data['prereqs']),
            cls.extract_course_titles(json_data['coreqs']),
            json_data['grading_type'],
            json_data['note'],
        )

    @classmethod
    def convert_json_data(cls, json_data):
        """Convert an iterable of course dicts into a list of nodes."""
        return [cls.from_json(node) for node in json_data]

    def to_string(self):
        """Return a multi-line, human-readable summary of the course."""
        return f"Title: {self.title}\nDescription: {self.description}\nUnits: {self.units}\nPrerequisites: {self.prereqs}\nCorequisites: {self.coreqs}\nGrading Type: {self.grading_type}\nNote: {self.note}\n"

    def __str__(self):
        """Return the instance attributes serialized as a JSON object."""
        return json.dumps(self.__dict__)

    def __repr__(self):
        # Same text as to_string(); kept in one place to avoid drift.
        return self.to_string()

# Path is relative to the current working directory.
# NOTE(review): presumably a list of course records in the shape
# ClassNode.convert_json_data expects -- confirm against the actual file.
file_path = "core_classes.json"

# Runs at import time; open() raises if the file does not exist.
# NOTE(review): json_data is not referenced again in the visible part of
# this file -- confirm whether another module relies on it.
with open(file_path, "r") as file:
    json_data = json.load(file)

class Scraper:
    """Scrape course listings and course detail pages from the SJSU catalog.

    The methods are meant to be called in order, each consuming the state
    left behind by the previous one:
    ``scrape`` -> ``filter_hrefs`` -> ``get_class_pages`` -> ``grab_td_tags``
    -> ``extract_course_info``.
    """

    def __init__(self, urls=None, timeout=30):
        """
        Args:
            urls: Catalog listing URLs to scrape. Defaults to a fresh empty
                list per instance (a shared ``[]`` default would leak URLs
                between instances).
            timeout: Seconds to wait for each HTTP request before giving up.
        """
        self.url_list = [] if urls is None else urls
        self.timeout = timeout
        self.html_list = []
        self.filtered_hrefs = []
        self.class_pages = []
        self.td_tags = []

    def _fetch(self, url: str) -> "BeautifulSoup":
        """Download ``url`` and return it parsed with ``html.parser``.

        Raises:
            Exception: If the response status is not 200.
        """
        # Without a timeout, requests waits indefinitely on a stalled server.
        response = requests.get(url, timeout=self.timeout)
        if response.status_code != 200:
            raise Exception("Request failed for", url)
        return BeautifulSoup(response.text, "html.parser")

    def scrape(self) -> "list[BeautifulSoup]":
        """Fetch and parse every URL in ``url_list`` into ``html_list``.

        Raises:
            Exception: If any listing page does not return status 200.
        """
        print("Scraping...")
        self.html_list = [self._fetch(url) for url in self.url_list]
        return self.html_list

    def filter_hrefs(self) -> list[str]:
        """Collect hrefs of links whose ``title`` names a target department.

        Target departments are the ``filter[27]`` query values of the URLs in
        ``url_list``. A department only matches when it is not part of a
        longer alphabetic word, so "SE" does not select "ISE ..." titles.
        Links without an ``href`` are skipped.

        Raises:
            KeyError: If a URL has no ``filter[27]`` query parameter.
        """
        print("Filtering hrefs...")
        self.filtered_hrefs = []
        target_departments = {
            parse_qs(urlparse(url).query)['filter[27]'][0]
            for url in self.url_list
        }
        # An empty alternation would match every title, so stop early.
        if not target_departments:
            return self.filtered_hrefs

        alternatives = "|".join(
            re.escape(department) for department in sorted(target_departments)
        )
        department_pattern = re.compile(
            rf"(?<![A-Za-z])(?:{alternatives})(?![A-Za-z])"
        )

        for doc in self.html_list:
            for link in doc.find_all("a"):
                title = link.get("title")
                href = link.get("href")
                if title and href and department_pattern.search(title):
                    self.filtered_hrefs.append(href)

        return self.filtered_hrefs

    def get_class_pages(self) -> list[str]:
        """Turn the filtered relative hrefs into absolute catalog URLs."""
        print("Getting class pages...")
        self.class_pages = [
            "https://catalog.sjsu.edu/" + href for href in self.filtered_hrefs
        ]
        return self.class_pages

    def grab_td_tags(self) -> "list[BeautifulSoup]":
        """Fetch every class page and collect its course content cells.

        The cells are ``<td class="block_content" colspan="2">`` elements.

        Raises:
            Exception: If any class page does not return status 200.
        """
        print("Grabbing tags...")
        self.td_tags = []
        total = len(self.class_pages)
        # 1-based counter so the final line reads "N / N".
        for idx, class_page in enumerate(self.class_pages, start=1):
            print(f"Processing {idx} / {total}...")
            try:
                doc = self._fetch(class_page)
            except Exception:
                print("Error: Request failed for", class_page)
                raise
            self.td_tags.extend(
                doc.find_all("td", class_="block_content", colspan="2")
            )
        return self.td_tags

    def extract(self, pattern: str, text: str, error: str, dotall=False):
        """Return the normalized first capture group of ``pattern`` in ``text``.

        Args:
            pattern: Regular expression with at least one capture group.
            text: Text to search.
            error: Fallback value used when the pattern does not match.
            dotall: When true, ``.`` also matches newlines.
        """
        match = re.search(pattern, text, re.DOTALL if dotall else 0)
        result = match.group(1).strip() if match else error
        return self.normalize(result)

    def normalize(self, text: str):
        """Reduce ``text`` to ASCII, trim it and collapse runs of spaces.

        NFKD decomposition splits accented characters so the base letter
        survives the ASCII encoding and maps non-breaking spaces to plain
        spaces.
        """
        ascii_text = (
            unicodedata.normalize("NFKD", text)
            .encode("ascii", "ignore")
            .decode("utf-8")
            .strip()
        )
        # A single str.replace("  ", " ") pass leaves a double space behind
        # for runs of three or more spaces.
        return re.sub(r" {2,}", " ", ascii_text)

    # If someone finds another way than regex please implement it
    def extract_course_info(self) -> "list[ClassNode]":
        """Parse every collected content cell into a ``ClassNode``.

        Fields that cannot be found are filled with a descriptive
        placeholder string instead of raising.
        """
        print("Extracting course info...")
        nodes = []
        total = len(self.td_tags)
        for idx, tag in enumerate(self.td_tags, start=1):
            print(f"Processing {idx} / {total}...")
            # Get course html block
            paragraph = tag.find("p")
            if paragraph is not None:
                title = paragraph.text.strip()
            else:
                title = "Title not found"

            # Find course name
            heading = tag.find("h1", id="course_preview_title")
            if heading is not None:
                course_name = heading.text.strip()
            else:
                course_name = "Course name not found"
            course_name = self.normalize(course_name)

            # Find units
            units = self.extract(r"(\d+(-\d+)? unit\(s\))", title, "Units not found")

            # Find course description: text between the units and the first
            # of the prerequisite / corequisite / grading labels
            course_description = self.extract(
                r"(?<=unit\(s\)\s)(.*?)(?=Prerequisite\(s\):|Corequisite\(s\):|Grading:)",
                title,
                "Description not found",
                dotall=True,
            )

            prereqs = self.extract(
                r"(?<=Prerequisite\(s\): )(.*?)(?=Corequisite\(s\):|Grading:)",
                title,
                "No prerequisites",
                dotall=True,
            )

            coreqs = self.extract(
                r"(?<=Corequisite\(s\): )(.*?)(?=Grading:)",
                title,
                "No corequisites",
                dotall=True,
            )

            # Find grading type
            grading_type = self.extract(
                r"Grading:\s(.*?)(?=Note\(s\)|Class Schedule)",
                title,
                "Grading type not found",
            )

            # Check if Note(s) or anything after "Note(s):" but before "Class Schedule" exist
            note = self.extract(
                r"(?<=Note\(s\): )(.*?)(?=Class Schedule)", title, "No additional notes"
            )

            nodes.append(
                ClassNode(
                    course_name=course_name,
                    units=units,
                    description=course_description,
                    prereqs=prereqs,
                    coreqs=coreqs,
                    grading_type=grading_type,
                    note=note,
                )
            )

        return nodes


# Run the whole pipeline at module level (there is no __main__ guard, so this
# also executes when the module is imported). The calls must stay in this
# order: each step reads the attribute filled in by the previous one.
scraper = Scraper(url_list)
scraper.scrape()  # download the listing pages into scraper.html_list
scraper.filter_hrefs()  # keep links whose title names a target department
scraper.get_class_pages()  # prefix the hrefs with the catalog base URL
scraper.grab_td_tags()  # download each class page and collect its content cells
nodes = scraper.extract_course_info()  # parse the cells into ClassNode objects
for node in nodes:
    print(node.to_string())

# Overwrites output.json in the current working directory with one JSON
# object per course, built from each node's instance attributes.
with open("output.json", "w") as f:
    json.dump([node.__dict__ for node in nodes], f, indent=4)