import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from collections import deque
import os
import re
# Set the base URL and the directory to save the data
base_url = 'https://uom.edu.pk'
# The output directory will be created inside your Codespace
output_dir = 'university_data'
# A set to store visited URLs to avoid infinite loops and duplicate scraping
visited_urls = set()
# A list to store the URLs of PDF files
pdf_links = []
# Create the output directory if it doesn't exist
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
    print(f"Created directory: {output_dir}")
def is_valid(url):
    """
    Check if a URL is valid and belongs to the same domain as the base URL.
    """
    parsed = urlparse(url)
    return parsed.scheme in ['http', 'https'] and parsed.netloc == urlparse(base_url).netloc
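
# For example (hypothetical URLs, given the base_url set above):
#   is_valid('https://uom.edu.pk/admissions')  -> True  (same domain)
#   is_valid('https://example.com/page')       -> False (different domain)
#   is_valid('mailto:info@uom.edu.pk')         -> False (not http/https)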
def get_all_links(url):
    """
    Fetch a page and return its valid, unvisited same-domain links,
    together with the parsed BeautifulSoup object.
    """
    try:
        response = requests.get(url, timeout=5)
        response.raise_for_status()  # Raise an error for bad status codes
        soup = BeautifulSoup(response.text, 'html.parser')
        links = set()
        for anchor in soup.find_all('a', href=True):
            href = anchor['href']
            full_url = urljoin(url, href)
            # Remove the fragment (anchor) part of the URL
            if '#' in full_url:
                full_url = full_url.split('#')[0]
            if is_valid(full_url) and full_url not in visited_urls:
                links.add(full_url)
        # Add the current URL to the visited set before returning
        visited_urls.add(url)
        return list(links), soup
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return [], None
def save_content(url, content):
    """
    Save the extracted text content of a page to a .txt file named
    after the URL path.
    """
    path = urlparse(url).path
    filename = path.strip('/').replace('/', '_') or 'index'
    filepath = os.path.join(output_dir, f"{filename}.txt")
    # Simple existence check to avoid overwriting files; this could be made more robust
    if not os.path.exists(filepath):
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(content)
        print(f"Saved content from {url} to {filepath}")
    else:
        print(f"File already exists, skipping: {filepath}")
def find_pdfs(soup, url):
    """
    Find and store links to PDF files.
    """
    for link in soup.find_all('a', href=True):
        href = link['href']
        full_url = urljoin(url, href)
        if full_url.lower().endswith('.pdf'):
            if full_url not in pdf_links:
                pdf_links.append(full_url)
                print(f"Found PDF link: {full_url}")
def crawl(start_url):
    """
    Main crawling function: a breadth-first traversal starting from start_url.
    """
    queue = deque([start_url])
    visited_urls.add(start_url)
    while queue:
        current_url = queue.popleft()
        print(f"Crawling: {current_url}")
        links, soup = get_all_links(current_url)
        if soup:
            # Extract the text of all paragraph elements on the page
            page_text = ' '.join([p.text for p in soup.find_all('p')])
            # Collapse runs of whitespace and newlines into single spaces
            page_text = re.sub(r'\s+', ' ', page_text).strip()
            if page_text:
                save_content(current_url, page_text)
            # Find PDFs on this page
            find_pdfs(soup, current_url)
        # Add newly discovered links to the queue
        for link in links:
            if link not in visited_urls:
                visited_urls.add(link)
                queue.append(link)
# Start the crawling process
print("Starting the web scraper...")
crawl(base_url)
print("\nScraping complete!")
print(f"Found {len(visited_urls)} total pages.")
print(f"Found {len(pdf_links)} PDF links.")