# scrape.py — ASU scholarship search scraper (forked from prodbrandon/equalify).
# NOTE(review): the lines this comment replaces were GitHub page chrome and a
# line-number gutter left over from a copy/paste; they were not valid Python.
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm # Import the tqdm library for the progress bar
import json
# Step 1: Build the listing-page URLs (pages 0-7 of the scholarship search).
# NOTE(review): the query joiner is '&' rather than '?' — preserved as-is,
# but verify the site actually accepts this URL form.
base_url = 'https://scholarships.asu.edu/scholarship-search&page='
urls = [f'{base_url}{i}' for i in range(0, 8)]  # Generate URLs for pages 0 to 7
links = []
# Steps 2-3: Fetch each listing page and collect every anchor's href,
# resolving relative URLs against the page they came from.
for url in urls:
    response = requests.get(url, timeout=30)  # timeout: don't hang forever on a dead server
    soup = BeautifulSoup(response.content, 'html.parser')
    for link in soup.find_all('a', href=True):  # all anchor tags with an href attribute
        href = link['href']
        if href.startswith('http'):
            links.append(href)
        else:
            # Relative link: convert it to an absolute URL.
            links.append(requests.compat.urljoin(url, href))
# Step 4: Keep only detail links of the form
# "https://scholarships.asu.edu/scholarship/<digits>", de-duplicated while
# preserving first-seen order so each scholarship page is scraped exactly once.
_prefix = 'https://scholarships.asu.edu/scholarship/'
filtered_links = list(dict.fromkeys(
    link for link in links
    if link.startswith(_prefix) and link[len(_prefix):].isdigit()
))
scholarships = []
# Step 5: Visit each filtered detail page, record its numeric ID, and pull
# the description text out of the div that also contains the page title.
for link in tqdm(filtered_links, desc="Scraping scholarship pages"):  # progress bar over the loop
    # The ID is the last path segment; rstrip guards against a trailing '/'
    # which would otherwise make split('/')[-1] return an empty string.
    id_number = link.rstrip('/').split('/')[-1]
    scholarship = {
        "id": id_number
    }
    # Fetch the individual scholarship page (timeout: avoid hanging on one slow page).
    response = requests.get(link, timeout=30)
    scholarship_soup = BeautifulSoup(response.content, 'html.parser')
    # The <h1 id="page-title"> element anchors the div that carries the
    # description text alongside the title.
    h1_element = scholarship_soup.find('h1', id='page-title')
    if h1_element:
        parent_div = h1_element.find_parent('div')
        if parent_div:
            # Flatten the div's text; this includes the description.
            scholarship["description"] = parent_div.get_text(strip=True, separator=' ')
        else:
            scholarship["description"] = "Parent div not found"
    else:
        scholarship["description"] = "H1 element not found"
    scholarships.append(scholarship)
# Print the results for a quick visual check.
print(scholarships)
# Save to a JSON file named scrape.json. Explicit utf-8 + ensure_ascii=False
# keeps any accented characters in titles/descriptions readable in the file
# instead of being written as \uXXXX escapes.
with open('scrape.json', 'w', encoding='utf-8') as f:
    json.dump(scholarships, f, indent=2, ensure_ascii=False)