# fetcher.py
import os
import random
import traceback
from typing import Any

from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeoutError

from config import PROXIES, TIMEOUT
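
# NOTE: config.py is not included here. This module assumes it exports the two names
# imported above; the values below are illustrative assumptions only:
#
#   PROXIES = ["http://user:pass@proxy1.example:8080", ...]  # proxy server URLs (may be empty)
#   TIMEOUT = 30_000                                          # default Playwright timeout, in milliseconds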


def get_random_proxy():
    """Select a random proxy from config."""
    if not PROXIES:
        return None
    return random.choice(PROXIES)


def fetch_page_with_capture(url, screenshot_path, site_dir, proxy=None, timeout=TIMEOUT, retries=2, enable_trace=True):
    """
    Fetches a webpage, saves a screenshot and trace (optional), and returns HTML + captured JS/XHR info.
    """
    html_content = ""
    saved_resources = {"js": [], "xhr": [], "all_requests": []}
    last_error = None

    # Use random proxy if not explicitly provided
    proxy = proxy or get_random_proxy()

    for attempt in range(1, retries + 2):
        try:
            with sync_playwright() as p:
                browser_args: dict[str, Any] = {"headless": True}

                # Configure proxy if available
                if proxy:
                    browser_args["proxy"] = {"server": proxy}
                    print(f"[NetBear] Using proxy: {proxy}")

                browser = p.chromium.launch(**browser_args)
                context = browser.new_context()

                trace_path = os.path.join(site_dir, "trace.zip")
                if enable_trace:
                    context.tracing.start(screenshots=True, snapshots=True, sources=True)

                page = context.new_page()
                page.set_default_timeout(timeout)

                # Set a common user-agent for stealth
                page.set_extra_http_headers({
                    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
                })

                # Capture JS and XHR requests
                def on_response(response):
                    try:
                        req = response.request
                        req_info = {
                            "method": req.method,
                            "url": req.url,
                            "headers": req.headers,
                            "postData": req.post_data,
                            "responseStatus": response.status,
                            "responseStatusText": response.status_text,
                            "responseHeaders": response.headers,
                        }
                        saved_resources["all_requests"].append(req_info)

                        ct = response.headers.get("content-type", "")

                        # High-interest keywords for this domain/target
                        HIGH_INTEREST_KEYWORDS = [
                            "appointment", "booking", "patient", "doctor", "slot",
                            "availability", "calendar", "prescription", "id",
                        ]

                        if "javascript" in ct:
                            body = response.text()

                            # Determine interest level (keywords kept lowercase to match body.lower())
                            interest_tag = "normal"
                            if any(kw in body.lower() for kw in HIGH_INTEREST_KEYWORDS):
                                interest_tag = "HIGH_INTEREST"
                            if any(kw in body.lower() for kw in ["eval", "new function", "innerhtml", "token"]):
                                interest_tag = "CRITICAL"

                            js_path = os.path.join(site_dir, f"res_{len(saved_resources['js'])}-{interest_tag}.js")
                            with open(js_path, "w", encoding="utf-8") as f:
                                f.write(body)
                            saved_resources["js"].append({
                                "url": response.url,
                                "path": js_path,
                                "tag": interest_tag,
                                "size": len(body),
                            })
                        elif "json" in ct or "xhr" in response.url:
                            body = response.text()
                            xhr_path = os.path.join(site_dir, f"xhr_{len(saved_resources['xhr'])}.json")
                            with open(xhr_path, "w", encoding="utf-8") as f:
                                f.write(body)
                            saved_resources["xhr"].append({"url": response.url, "path": xhr_path})
                    except Exception:
                        # Response bodies are not always available (e.g. redirects); skip quietly
                        pass

                page.on("response", on_response)

                try:
                    page.goto(url)
                except PlaywrightTimeoutError as te:
                    print(f"[Attempt {attempt}] ❌ Timeout fetching {url}: {te}")
                    last_error = ("timeout", str(te))
                    browser.close()
                    continue
                except Exception as e:
                    print(f"[Attempt {attempt}] ❌ Error during page.goto for {url}: {e}")
                    last_error = ("goto_error", str(e))
                    browser.close()
                    continue

                html_content = page.content()
                page.screenshot(path=screenshot_path)

                if enable_trace:
                    context.tracing.stop(path=trace_path)

                browser.close()
                print(f"[NetBear] Fetched {url} ✅")
                return html_content, saved_resources
        except PlaywrightTimeoutError as te:
            print(f"[Attempt {attempt}] ❌ Timeout fetching {url}: {te}")
            last_error = ("timeout", str(te))
        except Exception as e:
            print(f"[Attempt {attempt}] ❌ Error fetching {url}: {e}")
            traceback.print_exc()
            last_error = ("exception", str(e))

    print(f"[NetBear] Failed to fetch {url} after {retries + 1} attempts. Last error: {last_error}")
    return {"error": last_error, "url": url}, saved_resources
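

# Minimal usage sketch. The output directory, file names, and target URL below are
# illustrative assumptions, not part of the original module; only
# fetch_page_with_capture() and its return shapes come from the code above.
if __name__ == "__main__":
    site_dir = os.path.join("output", "example_site")
    os.makedirs(site_dir, exist_ok=True)

    result, resources = fetch_page_with_capture(
        url="https://example.com",
        screenshot_path=os.path.join(site_dir, "page.png"),
        site_dir=site_dir,
        enable_trace=False,
    )

    # On failure the first element is an error dict instead of the HTML string.
    if isinstance(result, dict) and "error" in result:
        print(f"Fetch failed: {result['error']}")
    else:
        print(f"Fetched {len(result)} bytes of HTML, "
              f"{len(resources['js'])} JS files, {len(resources['xhr'])} XHR captures")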