-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathAdvanced Webscaper.py
More file actions
132 lines (117 loc) · 4.39 KB
/
Advanced Webscaper.py
File metadata and controls
132 lines (117 loc) · 4.39 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
from urllib.parse import quote_plus #used to URL-encode the search query

import cloudscraper #This is to bypass cloudflare and firewalls
from flask import Flask, render_template, send_from_directory #will be used to display the webscraped html file.
from urlextract import URLExtract #used to extract urls
#htptx is used to scrape aswell, alternative since it uses http2 which is modern.
import httpx
#Used to fake the browser in headers for httpx
from fake_useragent import UserAgent
# Shared UserAgent helper; it supplies the Chrome User-Agent string used in
# the request headers below.
ua = UserAgent()

# Browser-like headers sent with every httpx request so the traffic looks like
# an ordinary page navigation arriving from a Google results page.
# NOTE(review): "br" is advertised in Accept-Encoding; httpx can only decode
# Brotli bodies when a Brotli package is installed - confirm it is, or drop "br".
xheaders = {
    "Accept": "*/*",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "en-US,en;q=0.9",
    "Connection": "keep-alive",
    "Referer": "https://www.google.com/",
    # Client hint for the preferred colour scheme ("Prefers", not "perfers").
    "Sec-CH-Prefers-Color-Scheme": "light",
    "Sec-Ch-Ua-Mobile": "?0",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "none",
    "Sec-Fetch-User": "?1",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": str(ua.chrome),
}

# HTTP/2-enabled httpx client reused for every httpx request in this script.
client = httpx.Client(http2=True)

# Finds URLs inside the scraped text.
extractor = URLExtract()

# Flask application used to preview the saved page in a browser.
app = Flask(__name__)
@app.route('/')
def index():
    """Serve the most recently scraped page, ``templates/site.html``.

    The file is sent verbatim rather than rendered as a Jinja template:
    scraped markup is untrusted and frequently contains ``{{ ... }}`` or
    ``{% ... %}`` sequences, which template rendering would either evaluate
    or reject with a template error.

    Returns:
        The Flask response carrying the saved HTML file.
    """
    return send_from_directory('templates', 'site.html')
# Optional first step: scrape a Google results page for a search term.
# Keep asking until the answer is Y or N (case-insensitive) so that a typo is
# not silently treated as a choice.
while True:
    ssearch = input("Scrape search? [Y/N] ").strip().lower()
    if ssearch in ("y", "n"):
        break
    print("Y or N only!")

if ssearch == "y":
    what = input("What do you want to search? ")
    # quote_plus turns spaces into "+" (the form Google uses) and also
    # percent-encodes characters such as "&", "#" and "+" that would otherwise
    # change the meaning of the query string.
    what = quote_plus(what)
    # https is requested directly: httpx does not follow the http -> https
    # redirect by default, and HTTP/2 is only negotiated over TLS.
    base = f"https://www.google.com/search?q={what}&gws_rd=ssl"
    try:
        # Request the results page with the browser-like headers.
        http2 = client.get(base, headers=xheaders)
    except httpx.HTTPError as exc:
        # Report the failure and move on to the direct-URL prompt.
        print("An error occured")
        print(exc)
    else:
        # Save the Google results page; UTF-8 is forced so the write does not
        # depend on the platform's default encoding.
        with open('./templates/site.html', 'w', encoding='utf-8') as f:
            f.write(http2.text)
        urls = extractor.find_urls(http2.text)
        # Save the extracted links (written as the list's Python repr).
        with open('./templates/links.txt', 'w', encoding='utf-8') as f:
            f.write(str(urls))
        # Serve the saved page on port 8080 on all interfaces; this call
        # blocks until the server is stopped.
        app.run('0.0.0.0', 8080)
# Answering N skips the search step entirely.
# Address of the page to scrape. Surrounding whitespace is dropped, and a bare
# host such as "example.com" gets an https:// prefix because a URL without a
# scheme cannot be requested by the HTTP clients used below.
url = input("URL: ").strip()
if url and "://" not in url:
    url = "https://" + url

# cloudscraper session used for the Cloudflare-bypass option.
#  - browser: ask for a Chrome-on-iOS, non-desktop browser profile.
#  - allow_brotli=False: Brotli support is switched off (the original author
#    found the results harder to work with when it was enabled).
#  - ecdhCurve='secp384r1': elliptic curve used for the TLS handshake; the
#    original author picked it for sites that are harder to scrape.
#    NOTE(review): confirm both options against the cloudscraper documentation.
scraper = cloudscraper.create_scraper(
    browser={
        'browser': 'chrome',
        'platform': 'ios',
        'desktop': False,
    },
    allow_brotli=False,
    ecdhCurve='secp384r1',
)
def _save_and_serve(html):
    """Save *html* and the links found in it, then serve the saved page.

    Writes the markup to ``./templates/site.html`` and the extracted links
    (as the list's Python repr) to ``./templates/links.txt``, then starts the
    Flask app on port 8080 on all interfaces. The call blocks until the
    server is stopped.

    Args:
        html: Text of the scraped page.
    """
    # UTF-8 is forced so the write does not depend on the platform default.
    with open('./templates/site.html', 'w', encoding='utf-8') as f:
        f.write(html)
    links = extractor.find_urls(html)
    with open('./templates/links.txt', 'w', encoding='utf-8') as f:
        f.write(str(links))
    print("Links saved to links.txt")
    print("HTML saved to site.html")
    app.run('0.0.0.0', 8080)


# Let the user pick the backend: "1" uses the httpx HTTP/2 client, "2" uses
# cloudscraper.
print("httpx2 uses modern http2 requests and seems realistic, cloudscraper is used to bypass cloudflare and bypass anti-bot + decompress brotli")
print(" ")
h2orcf = input("httpx2 [1] or cloudscraper? [2]: ")
if h2orcf == "1":
    print("Scraping site...[May take a bit]")
    try:
        http2 = client.get(url, headers=xheaders)
    except (httpx.HTTPError, httpx.InvalidURL) as exc:
        # Nothing was fetched, so nothing is saved or served. Previously the
        # code went on to use the response anyway, which either raised
        # NameError or reused a response left over from the search step.
        print("An error occured")
        print(exc)
    else:
        _save_and_serve(http2.text)
elif h2orcf == "2":
    print("Scraping site...[May take a bit]")
    try:
        cf = scraper.get(url)
    except Exception as exc:
        # Top-level boundary of the script: cloudscraper can raise its own
        # errors as well as those of the HTTP library underneath it, so the
        # failure is reported here instead of being narrowed further.
        print("An error occured")
        print(exc)
    else:
        _save_and_serve(cf.text)
else:
    print("Select 1 or 2 only!")
# And we're done! This is the web scraper: it scrapes a site, downloads the data, bypasses Cloudflare, decompresses the response, saves it as an HTML file, extracts and saves the links, then serves a preview of the site you just scraped.
# A powerful yet simple web scraper that can bypass Cloudflare, uses HTTP/2 and browser-like headers to emulate a real person, serves the saved HTML file, and stores the links.