# crawler.py
import requests
from bs4 import BeautifulSoup
import logging
import trafilatura
from urllib.parse import urlparse, urljoin
import time
import re


def validate_url(url):
    """
    Validate if the provided URL has a valid format.

    Args:
        url (str): URL to validate

    Returns:
        bool: True if valid URL format, False otherwise
    """
    try:
        result = urlparse(url)
        return all([result.scheme, result.netloc]) and result.scheme in ['http', 'https']
    except Exception as e:
        logging.error(f"URL validation error: {str(e)}")
        return False
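

# Illustrative behavior of validate_url, based on the implementation above
# (the example URLs are placeholders, not taken from the source):
#   validate_url("https://example.com/docs")  -> True
#   validate_url("ftp://example.com")         -> False  (scheme must be http or https)
#   validate_url("/relative/path")            -> False  (missing scheme and host)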


def normalize_url(url, base_url):
    """
    Normalize relative URLs to absolute URLs.

    Args:
        url (str): URL to normalize
        base_url (str): Base URL for relative URLs

    Returns:
        str: Normalized URL
    """
    try:
        return urljoin(base_url, url)
    except Exception:
        return None
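

# Illustrative behavior of normalize_url (hypothetical values, for demonstration only):
#   normalize_url("../about", "https://example.com/docs/page")
#       -> "https://example.com/about"
#   normalize_url("contact.html", "https://example.com/docs/")
#       -> "https://example.com/docs/contact.html"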


def crawl_website(url, instructions=None, max_pages=20, max_depth=2):
    """
    Crawl a website and extract information based on user instructions.

    Args:
        url (str): The URL to crawl
        instructions (str, optional): Custom instructions for crawling
        max_pages (int): Maximum number of pages to crawl
        max_depth (int): Maximum depth of crawling

    Returns:
        dict: Crawl results including links, text content, and metadata
    """
    if not validate_url(url):
        raise ValueError("Invalid URL format")

    # Parse custom instructions
    depth_match = re.search(r'depth[:\s]+(\d+)', instructions or '', re.IGNORECASE)
    if depth_match:
        max_depth = min(int(depth_match.group(1)), 5)  # Limit to 5 for safety

    pages_match = re.search(r'pages[:\s]+(\d+)', instructions or '', re.IGNORECASE)
    if pages_match:
        max_pages = min(int(pages_match.group(1)), 50)  # Limit to 50 for safety

    # Initialize the crawl results
    visited = set()
    to_visit = [(url, 0)]  # (url, depth)
    results = {
        'links': [],
        'text': '',
        'metadata': {
            'title': '',
            'description': '',
            'base_url': url,
            'pages_crawled': 0,
            'crawl_time': 0
        },
        'page_data': []
    }

    # Track specific patterns based on instructions
    track_images = 'images' in (instructions or '').lower()
    track_headings = 'headings' in (instructions or '').lower()

    start_time = time.time()

    while to_visit and len(visited) < max_pages:
        current_url, current_depth = to_visit.pop(0)

        if current_url in visited or current_depth > max_depth:
            continue

        try:
            # Add to visited set
            visited.add(current_url)

            # Fetch the page content
            logging.info(f"Crawling: {current_url}")
            headers = {'User-Agent': 'Mozilla/5.0 WebsiteCrawler/1.0'}
            response = requests.get(current_url, headers=headers, timeout=10)
            response.raise_for_status()  # Raise exception for bad status codes

            # Parse with BeautifulSoup
            soup = BeautifulSoup(response.text, 'html.parser')

            # Get metadata
            page_title = soup.title.string if soup.title else "No Title"
            page_description = ''
            meta_desc = soup.find('meta', attrs={'name': 'description'})
            if meta_desc and 'content' in meta_desc.attrs:
                page_description = meta_desc['content']

            # Extract text content using trafilatura for better text extraction
            downloaded = trafilatura.fetch_url(current_url)
            page_text = trafilatura.extract(downloaded) or "No text content extracted"

            # Add to results
            page_data = {
                'url': current_url,
                'title': page_title,
                'description': page_description,
                'text_sample': page_text[:200] + "..." if len(page_text) > 200 else page_text,
                'depth': current_depth
            }

            # Track additional elements based on instructions
            if track_images:
                page_data['images'] = [img.get('src', '') for img in soup.find_all('img') if img.get('src')]

            if track_headings:
                page_data['headings'] = []
                for i in range(1, 7):
                    for heading in soup.find_all(f'h{i}'):
                        page_data['headings'].append({
                            'level': i,
                            'text': heading.get_text(strip=True)
                        })

            results['page_data'].append(page_data)
            results['text'] += f"\n\n--- {page_title} ---\n{page_text}"

            # If this is the first page, update metadata
            if len(results['page_data']) == 1:
                results['metadata']['title'] = page_title
                results['metadata']['description'] = page_description

            # Extract links if we haven't reached max depth
            if current_depth < max_depth:
                for link in soup.find_all('a', href=True):
                    href = link['href']
                    if href.startswith('#') or href.startswith('javascript:'):
                        continue

                    absolute_url = normalize_url(href, current_url)
                    if absolute_url and validate_url(absolute_url):
                        # Only follow links to the same domain
                        if urlparse(absolute_url).netloc == urlparse(url).netloc:
                            if absolute_url not in visited:
                                to_visit.append((absolute_url, current_depth + 1))
                                results['links'].append({
                                    'url': absolute_url,
                                    'text': link.get_text(strip=True),
                                    'depth': current_depth + 1
                                })

        except Exception as e:
            logging.error(f"Error crawling {current_url}: {str(e)}")
            continue

    # Update metadata
    results['metadata']['pages_crawled'] = len(visited)
    results['metadata']['crawl_time'] = round(time.time() - start_time, 2)

    return results
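

if __name__ == "__main__":
    # Minimal usage sketch, not part of the original module: the target URL and the
    # instruction string below are placeholders chosen for illustration only. The
    # instruction text exercises the depth/pages overrides and heading tracking
    # that crawl_website parses above.
    logging.basicConfig(level=logging.INFO)
    demo = crawl_website(
        "https://example.com",
        instructions="depth: 1 pages: 5 headings",
        max_pages=5,
        max_depth=1,
    )
    meta = demo['metadata']
    print(f"Crawled {meta['pages_crawled']} page(s) in {meta['crawl_time']}s")
    for page in demo['page_data']:
        print(f"[depth {page['depth']}] {page['title']} -> {page['url']}")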