-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathemail_spider.py
More file actions
68 lines (59 loc) · 2.58 KB
/
email_spider.py
File metadata and controls
68 lines (59 loc) · 2.58 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
#-*-coding:utf-8-*-
import io
import re
import sys

import requests
# Crawl the Xidian University faculty directory and save teacher e-mail
# addresses.  Addresses ending in "@xidian.edu.cn" are written to 1.txt and
# addresses ending in "@mail.xidian.edu.cn" to 2.txt, one
# "name<TAB>address" record per line, UTF-8 encoded.

BASE_URL = 'http://web.xidian.edu.cn/'
REQUEST_TIMEOUT = 30  # seconds; without it a stalled server hangs the crawl
PRIMARY_DOMAIN = u'xidian.edu.cn'
MAIL_DOMAIN = u'mail.xidian.edu.cn'
PRIMARY_OUTPUT = u'1.txt'
MAIL_OUTPUT = u'2.txt'


def fetch_page(path):
    """Download ``BASE_URL + path`` and return its body decoded as UTF-8.

    Raises ``requests.RequestException`` on network failure or timeout.
    """
    response = requests.get(BASE_URL + path, timeout=REQUEST_TIMEOUT)
    # The pages are decoded as UTF-8 regardless of the response headers.
    response.encoding = 'utf-8'
    return response.text


def fetch_page_or_none(path):
    """Like ``fetch_page`` but report a network failure and return None.

    One unreachable page must not abort a crawl that spans many pages.
    """
    try:
        return fetch_page(path)
    except requests.RequestException as exc:
        sys.stderr.write('skipping %r: %r\n' % (path, exc))
        return None


def parse_department_links(index_html):
    """Return the relative department links found on the index page.

    Only the section between the "browse by department" and "browse by
    surname initial" headings is scanned.  Returns an empty list when that
    section is not present.
    """
    section = re.search(
        u'<div class="left_title">按院系查找</div>(.*?)'
        u'<div class="left_title">按姓氏首字母查找</div>',
        index_html,
        re.S,
    )
    if section is None:
        return []
    return re.findall(u'<a href="(.*?)" title', section.group(1), re.S)


def parse_teacher_entries(department_html):
    """Return ``(relative_path, teacher_name)`` pairs from a department page.

    Returns an empty list when the teacher list section is missing; list
    items that do not contain both a link path and a name are skipped.
    """
    section = re.search(
        u'教师主页\\(按字母排列\\)</div>(.*?)</ul>', department_html, re.S
    )
    if section is None:
        return []
    entries = []
    # Each item is the text from just after '<a href="' up to '</li>'.
    for item in re.findall(u'<a href="(.*?)</li>', section.group(1), re.S):
        # The path is whatever follows the first '/' up to the closing quote.
        path = re.search(u'/(.*?)"', item, re.S)
        name = re.search(u'>(.*?)</a>', item, re.S)
        if path is None or name is None:
            continue
        entries.append((path.group(1), name.group(1)))
    return entries


def extract_email(page_text, domain):
    """Return the ``local@domain`` address listed on a teacher page, or None.

    The address is looked up after the e-mail label on the page.  When the
    text between the label and ``@domain`` contains a ``mailto:`` link, only
    the part after the first ``mailto:`` is kept as the local part.
    """
    # re.escape keeps the dots in the domain literal instead of wildcards.
    found = re.search(
        u'电子邮箱:(.*?)@' + re.escape(domain), page_text, re.S
    )
    if found is None:
        return None
    local_part = found.group(1)
    _, marker, after_marker = local_part.partition(u'mailto:')
    if marker:
        local_part = after_marker
    return local_part + u'@' + domain


def main():
    """Crawl every department and teacher page and write the two files."""
    department_paths = parse_department_links(fetch_page(''))
    if not department_paths:
        # Fail before touching the output files, so a layout change on the
        # site does not silently truncate previously collected results.
        raise SystemExit('department list not found on ' + BASE_URL)

    with io.open(PRIMARY_OUTPUT, 'w', encoding='utf-8') as primary_out, \
            io.open(MAIL_OUTPUT, 'w', encoding='utf-8') as mail_out:
        for department_path in department_paths:
            department_html = fetch_page_or_none(department_path)
            if department_html is None:
                continue
            entries = parse_teacher_entries(department_html)
            for teacher_path, teacher_name in entries:
                profile_html = fetch_page_or_none(teacher_path)
                if profile_html is None:
                    continue
                # The primary domain wins; the mail sub-domain is only
                # consulted when no primary address is listed.
                address = extract_email(profile_html, PRIMARY_DOMAIN)
                sink = primary_out
                if address is None:
                    address = extract_email(profile_html, MAIL_DOMAIN)
                    sink = mail_out
                if address is None:
                    continue
                sink.write(teacher_name + u'\t' + address + u'\n')


if __name__ == '__main__':
    main()