-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathget_status.py
More file actions
119 lines (88 loc) · 2.94 KB
/
get_status.py
File metadata and controls
119 lines (88 loc) · 2.94 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
# Standard library
import datetime
import multiprocessing as mp
import os

# Third-party
import click  # used by the @click.command() CLI below; was missing (NameError at import)
import pandas as pd
import requests
import yagmail
from mongoengine import *

# Open the default MongoDB connection used by the Domain documents below.
connect(
    db='urlcheck_5',
)
def geturllist():
    """Read URLs from remove.txt (one per line) and return them de-duplicated.

    Returns:
        list[str]: unique URLs in first-seen order.
    """
    df = pd.read_table('remove.txt', encoding="ISO-8859-1", names=['url'])
    # drop_duplicates() returns a new frame; the original discarded its
    # result, so duplicates were never actually removed.
    df = df.drop_duplicates()
    return df['url'].tolist()
class Domain(DynamicDocument):
    """One URL-check result persisted in MongoDB (db 'urlcheck_5').

    DynamicDocument allows extra ad-hoc fields beyond those declared here.
    """
    # HTTP status code stored as text (e.g. '200'), or an error marker
    # when the request failed — see get_status_code().
    status_code = StringField()
    #keyword = StringField()
    # Original URL as read from the input list (before convert()).
    url = StringField()
    # URL after redirects, as reported by requests (r.url).
    final_url = StringField()
    #time = DateTimeField()
    # True when the record was produced by the requests-based checker.
    is_requests = BooleanField()
def get_status_code(url):
    """Fetch *url* and report its HTTP status and final (post-redirect) URL.

    Args:
        url: URL to check; normalized via convert() before the request.

    Returns:
        tuple[str, str]: (status_code, final_url) on success, or
        ('failed to connect', '') on failure — always a 2-tuple, so callers
        can index [0]/[1] safely.  (The original returned a bare string on
        failure, which made savestatus() index into its characters.)
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
    }
    try:
        r = requests.get(convert(url), headers=headers, timeout=5)
        return (str(r.status_code), str(r.url))
    except requests.RequestException:
        # RequestException also covers Timeout and TooManyRedirects, which
        # the original ConnectionError-only handler let escape and crash
        # the pool worker.
        return ('failed to connect', '')
def getkeyword(url):
    """Placeholder for keyword extraction; not implemented (returns None)."""
    return None
def savestatus(url):
    """Check *url* and persist the result as a Domain document.

    Skips URLs already stored.  Errors (network, database) are reported and
    swallowed so a pool worker never takes down the whole run.
    """
    try:
        if Domain.objects(url=url).first():
            print('Already in Database')
        else:
            # One request instead of the original's two identical calls.
            # convert() is applied inside get_status_code() and is
            # idempotent, so passing the raw url is equivalent.
            result = get_status_code(url)
            status, final_url = result[0], result[1]
            Domain(status_code=str(status),
                   # keyword='test1',
                   url=str(url),
                   final_url=final_url,
                   # time=datetime.datetime.now(),
                   is_requests=True).save()
            print('{} : {} has saved \n'.format(str(status), str(url)))
    except Exception:
        # Narrowed from a bare except: still best-effort, but no longer
        # swallows SystemExit/KeyboardInterrupt.
        print('save fail')
def getcsv():
    """Export every Domain document to urlcheck.csv.

    Returns:
        tuple[int, int]: the (rows, cols) shape of the exported frame.
    """
    docs = Domain.objects().all()
    data = [(d.url, d.status_code, d.final_url, d.is_requests) for d in docs]
    df = pd.DataFrame(data, columns=['url', 'status_code', 'final_url', 'is_requests'])
    # drop_duplicates() returns a new frame; the original discarded its
    # result, so duplicate rows ended up in the CSV.
    df = df.drop_duplicates()
    df.to_csv('urlcheck.csv')
    return df.shape
def convert(url):
    """Normalize *url* into a fetchable form.

    Rules (preserving the original intent):
      * 'https://www.X' -> unchanged
      * 'http://www.X'  -> 'http://X'  (www stripped)
      * 'www.X'         -> 'http://X'
      * bare host       -> 'http://' prefixed
      * any other already-schemed URL -> unchanged.  The original mangled
        'https://host' (no www) into 'http://https://host' because it only
        tested for the 'http://' scheme in the final branch.
    """
    if url.startswith('https://www.'):
        return url
    if url.startswith('http://www.'):
        return 'http://' + url[len('http://www.'):]
    if url.startswith('www.'):
        return 'http://' + url[len('www.'):]
    # startswith accepts a tuple: keep https:// URLs intact too.
    if not url.startswith(('http://', 'https://')):
        return 'http://' + url
    return url
@click.command()
@click.option('--dele', default='no_del', prompt='( del ) To Delete,( no_del ) To Hold On, Default no_del', help='del Or no_del')
def deleted(dele):
    """CLI entry point: optionally wipe all stored records, then run the check.

    Args:
        dele: 'del' to delete every Domain document first; anything else
            (default 'no_del') leaves the database untouched.
    """
    if dele == 'del':
        # The original called d.delete() (the whole queryset) inside the
        # loop, re-deleting everything once per document; delete each
        # document exactly once instead.
        for x in Domain.objects().all():
            x.delete()
        print('delete all records in database')
    # Both branches ended in `return run()`; hoisted out of the if/else.
    return run()
def run():
    """Check every URL from remove.txt with 8 worker processes, then export CSV."""
    urllist = geturllist()
    pool = mp.Pool(processes=8)
    try:
        pool.map(savestatus, urllist)
    finally:
        # Always release the worker processes, even if a worker raised —
        # the original leaked the pool on any exception.  Also dropped the
        # unused `p` and `shape` locals.
        pool.close()
        pool.join()
    getcsv()
if __name__ == '__main__':
    # Script entry point: the click command prompts for the optional DB
    # wipe, then kicks off the parallel URL check via run().
    deleted()