generateData.py
import re
import json
from selenium import webdriver
from selenium.webdriver import ChromeOptions
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import time
# Set up the Selenium WebDriver (with Selenium 4.6+ no explicit chromedriver path is needed; Selenium Manager resolves it)
options = ChromeOptions()
driver = webdriver.Chrome(options=options)
# The URL for the ASU class catalog page
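# Query parameters: campusOrOnlineSelection=C limits results to in-person (campus) classes,
# level=undergrad to undergraduate courses, and term=2251 appears to encode the Spring 2025 term.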
url = 'https://catalog.apps.asu.edu/catalog/classes/classlist?campusOrOnlineSelection=C&honors=F&level=undergrad&promod=F&searchType=all&term=2251'
# Open the URL
driver.get(url)
# Function to extract course tags (like WPC 480) using a regular expression to ensure proper format
def extract_courses_from_page(page_source, unique_courses):
    soup = BeautifulSoup(page_source, 'html.parser')

    # Find all course tags (e.g., "WPC 480") within <span> tags with class 'bold-hyperlink'
    course_tags = soup.find_all('span', class_='bold-hyperlink')

    # Regular expression to match valid course codes (e.g., "ABC 123")
    course_code_pattern = re.compile(r'^[A-Z]{3} \d{3}$')

    # Collect and store only valid course codes
    for course in course_tags:
        course_text = course.text.strip()
        if course_code_pattern.match(course_text):  # Only add valid course codes
            if course_text not in unique_courses:
                unique_courses.add(course_text)  # Add to the set for uniqueness

    return unique_courses
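# For example, a span containing "WPC 480" is kept, while text such as "WPC480" or
# "wpc 480" is skipped, since the pattern requires exactly three uppercase letters,
# a single space, and three digits.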
# Store all unique course tags
unique_courses = set() # Use a set to ensure no duplicates
# Extract course tags from the first page
unique_courses = extract_courses_from_page(driver.page_source, unique_courses)
time.sleep(60)  # Long pause after the first page load (presumably to let the catalog's dynamic content finish rendering)
# Navigate through pages
while True:
    try:
        # Scroll down to make sure the "Next" button is visible
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(5)  # Adjust wait time if necessary

        # Find the 'Next' button using a combination of class name and aria-label
        next_button = driver.find_element(By.XPATH, '//a[@aria-label="Next page" and contains(@class, "page-link")]')

        # Check if the 'Next' button is disabled by checking the aria-disabled attribute
        if next_button.get_attribute('aria-disabled') == 'true':
            break  # Exit if the 'Next' button is disabled

        # Click the 'Next' button using JavaScript (sometimes it's more reliable)
        driver.execute_script("arguments[0].click();", next_button)

        # Wait for the page to load (adjust the time if necessary)
        time.sleep(5)

        # Extract course tags from the next page
        unique_courses = extract_courses_from_page(driver.page_source, unique_courses)

        # Print the list of unique course tags after each page
        print("Current list of course tags after this page:")
        for course in sorted(unique_courses):  # Sorted for better readability
            print(f"Course: {course}")

    except Exception as e:
        print("No more pages or an error occurred:", e)
        break
# Close the Selenium browser
driver.quit()
# Convert the set of unique courses to a sorted list for saving to JSON
course_list = sorted(unique_courses)
# Write the unique course list to a JSON file
with open('scrape.json', 'w') as f:
    json.dump(course_list, f, indent=4)
# Print the final list of unique scraped course tags
print("Final list of unique course tags saved to scrape.json:")
if unique_courses:
    for course in course_list:  # Already sorted for better readability
        print(f"Course: {course}")
else:
    print("No courses were scraped.")