-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathweb_scraper.py
More file actions
170 lines (128 loc) · 6.19 KB
/
web_scraper.py
File metadata and controls
170 lines (128 loc) · 6.19 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
#!/usr/bin/env python3
"""
Web scraper to download tutorial web pages from tutorialspoint.com
"""
import hashlib
import os
import re
from urllib.parse import urljoin, urlparse

import markdownify
import requests
from bs4 import BeautifulSoup
def download_tutorial_pages():
    """
    Crawl and download the batch_script tutorial from tutorialspoint.com.

    Starting at https://www.tutorialspoint.com/batch_script/index.htm, performs
    a breadth-first crawl over every page whose URL starts with
    https://www.tutorialspoint.com/batch_script/:
    saves each page's raw HTML under docs/www/, downloads referenced images
    via download_image(), and converts each saved page to markdown via
    convert_html_to_markdown(). Per-page download errors are logged and
    skipped so one bad page does not abort the crawl.
    """
    base_url = "https://www.tutorialspoint.com/batch_script/index.htm"
    visited_urls = set()
    urls_to_visit = [base_url]

    # Create output directories up front so the writes below cannot fail on them.
    os.makedirs("docs/www", exist_ok=True)
    os.makedirs("docs/imgs", exist_ok=True)

    while urls_to_visit:
        current_url = urls_to_visit.pop(0)
        if current_url in visited_urls:
            continue
        print(f"Processing: {current_url}")
        try:
            # Timeout prevents a single unresponsive page from hanging the crawl.
            response = requests.get(current_url, timeout=30)
            response.raise_for_status()

            # Save the raw HTML under the page's own basename.
            # BUG FIX: the open() path previously used a literal placeholder
            # instead of {filename}, so every page overwrote the same file and
            # convert_html_to_markdown(filename) could never find its input.
            filename = os.path.basename(urlparse(current_url).path) or "index.htm"
            with open(f"docs/www/{filename}", "w", encoding="utf-8") as f:
                f.write(response.text)

            soup = BeautifulSoup(response.text, 'html.parser')

            # Download every image referenced by the page.
            for img_tag in soup.find_all('img'):
                img_src = img_tag.get('src')
                if img_src:
                    img_url = urljoin(current_url, img_src)
                    download_image(img_url)

            # Follow the "next chapter" button (id 'btn_top_next'); it is
            # usually wrapped in an anchor tag carrying the target href.
            next_button = soup.find('div', id='btn_top_next')
            if next_button:
                parent_link = next_button.find_parent('a')
                if parent_link and parent_link.get('href'):
                    next_url = urljoin(current_url, parent_link['href'])
                    if next_url.startswith("https://www.tutorialspoint.com/batch_script/"):
                        # Dedup against the queue too, matching the loop below,
                        # so the same URL is never enqueued twice.
                        if next_url not in visited_urls and next_url not in urls_to_visit:
                            urls_to_visit.append(next_url)

            # Also enqueue any other links that stay within the tutorial.
            for link in soup.find_all('a', href=True):
                link_url = urljoin(current_url, link['href'])
                if link_url.startswith("https://www.tutorialspoint.com/batch_script/"):
                    if link_url not in visited_urls and link_url not in urls_to_visit:
                        urls_to_visit.append(link_url)

            # Convert the freshly saved HTML page to markdown.
            convert_html_to_markdown(filename)
            visited_urls.add(current_url)
        except requests.RequestException as e:
            print(f"Error downloading {current_url}: {e}")
            continue

    print(f"Downloaded {len(visited_urls)} pages.")
def download_image(img_url):
    """
    Download one image to docs/imgs/ and repoint saved HTML pages at it.

    The image is saved under its URL basename; if another file with that name
    already exists, a numeric suffix is appended until the name is free. After
    saving, update_image_references() rewrites the saved HTML files to use the
    local copy. Download errors are logged and swallowed (best-effort).
    """
    try:
        # Timeout prevents one stalled image server from hanging the crawl.
        response = requests.get(img_url, timeout=30)
        response.raise_for_status()

        img_filename = os.path.basename(urlparse(img_url).path)
        if not img_filename:
            # BUG FIX: the fallback name used hash(img_url), which is salted
            # per interpreter run (PYTHONHASHSEED) and therefore not
            # reproducible across runs; use a stable digest of the URL.
            img_filename = "image_" + hashlib.md5(img_url.encode("utf-8")).hexdigest() + ".jpg"

        # Ensure a unique filename by appending _1, _2, ... on collision.
        counter = 1
        original_filename = img_filename
        while os.path.exists(f"docs/imgs/{img_filename}"):
            name, ext = os.path.splitext(original_filename)
            img_filename = f"{name}_{counter}{ext}"
            counter += 1

        with open(f"docs/imgs/{img_filename}", "wb") as f:
            f.write(response.content)
        print(f"Downloaded image: {img_filename}")

        # Rewrite image references in the saved HTML to the local copy.
        update_image_references(original_filename, img_filename)
    except requests.RequestException as e:
        print(f"Error downloading image {img_url}: {e}")
def update_image_references(old_filename, new_filename):
    """
    Rewrite saved HTML pages so image references point at the local copy.

    Every occurrence of *old_filename* in any .htm/.html file under docs/www/
    is replaced with ../imgs/<new_filename>. Files that contain no occurrence
    are left untouched on disk.
    """
    local_ref = f"../imgs/{new_filename}"
    for entry in os.listdir("docs/www/"):
        # Only HTML pages are candidates for rewriting.
        if not (entry.endswith(".htm") or entry.endswith(".html")):
            continue
        path = f"docs/www/{entry}"
        with open(path, "r", encoding="utf-8") as fh:
            original_text = fh.read()
        rewritten = original_text.replace(old_filename, local_ref)
        # Skip the write entirely when nothing changed.
        if rewritten == original_text:
            continue
        with open(path, "w", encoding="utf-8") as fh:
            fh.write(rewritten)
def convert_html_to_markdown(html_filename):
    """
    Convert one saved HTML page under docs/www/ to a markdown file in docs/.

    Strips <script>/<style> elements, extracts the main content container
    (falling back to the whole document), converts it with markdownify using
    ATX headings, and writes <stem>.md into docs/. Any failure is logged and
    swallowed so a single bad page cannot abort the crawl.
    """
    try:
        with open(f"docs/www/{html_filename}", "r", encoding="utf-8") as fh:
            raw_html = fh.read()

        soup = BeautifulSoup(raw_html, 'html.parser')

        # Drop script and style elements so they don't leak into the markdown.
        for node in soup(["script", "style"]):
            node.decompose()

        # Prefer a recognizable content container; the selectors may need
        # adjusting if the site's page structure changes. Fall back to the
        # whole document when none matches.
        main_content = (
            soup.find('div', class_='content')
            or soup.find('div', id='content')
            or soup.find('div', class_='intro')
            or soup
        )

        markdown_content = markdownify.markdownify(str(main_content), heading_style="ATX")

        # The docs/ directory may not exist yet on the first conversion.
        os.makedirs("docs", exist_ok=True)

        # Same basename as the HTML page, with a .md extension.
        md_filename = os.path.splitext(html_filename)[0] + ".md"
        with open(f"docs/{md_filename}", "w", encoding="utf-8") as fh:
            fh.write(markdown_content)
        print(f"Converted {html_filename} to {md_filename}")
    except Exception as e:
        # Best-effort: report and continue with the remaining pages.
        print(f"Error converting {html_filename} to markdown: {e}")
# Entry point: start the crawl when the file is executed as a script.
if __name__ == "__main__":
    download_tutorial_pages()