-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscraper2.py
More file actions
78 lines (63 loc) · 1.93 KB
/
scraper2.py
File metadata and controls
78 lines (63 loc) · 1.93 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
from gevent import monkey
monkey.patch_all()
from lxml import html
import requests
import re
import json
import gevent.pool
import gevent.queue
from datetime import datetime
# Shared crawler state used by worker(), scrape() and the driver code below.
# Greenlet pool with 32 slots for the worker greenlets.
pool = gevent.pool.Pool(32)
# Pending (url, tag) jobs; filled by the driver code, drained by worker().
queue = gevent.queue.Queue()
# One requests session reused for every HTTP GET.
session = requests.Session()
# Running count of products recorded by scrape().
a = 0
# Start timestamp for the elapsed-time report printed at the end of the run.
start_time = datetime.now()
# tag -> list of (name, link, image src) tuples; dumped to data.json at exit.
json_dict = {}
# Page whose menu links seed the queue.
url = 'http://www.pgshop.com/pgshop-hers/'
# Captures a run of words (2+ word characters, then optional space-separated
# tokens of word characters or '&') that is immediately followed by a literal
# backslash; an optional leading 't' is consumed outside the capture group.
# The driver code applies it to repr() of a link's text nodes, where tabs and
# newlines appear as literal "\t" / "\n" escape sequences.
regex = re.compile(r"t?([\w]{2,}(?:(?: [\w&]+)?)+)\\")
def worker():
    """Process queued (url, tag) jobs until the shared queue is empty.

    Each job is taken without blocking and handed to scrape(). The function
    returns as soon as gevent.queue.Empty is raised.
    """
    try:
        while True:
            page_url, page_tag = queue.get_nowait()
            scrape(page_url, page_tag)
    except gevent.queue.Empty:
        # Nothing left to do for this greenlet.
        return
def scrape(url, tag):
    """Fetch one listing page and record the products found on it.

    Every ``<p class="product-name">`` element contributes a
    ``(name, link, image_src)`` tuple appended to ``json_dict[tag]``, and the
    module-level counter ``a`` is incremented for each product recorded.

    Args:
        url: Address of the page to download.
        tag: Key under which this page's products are grouped in
            ``json_dict``.

    Returns:
        None. A failed request is reported on stdout and the page is skipped.
    """
    global a
    try:
        r = session.get(url)
    except Exception as e:
        # Best-effort crawl: report the failure and skip this page. The early
        # return is required because ``r`` is unbound after a failed request,
        # so falling through would raise UnboundLocalError.
        print('%s %s' % ('ERROR - ', str(e)))
        return
    tree = html.fromstring(r.text)
    products = tree.xpath('//p[@class="product-name"]')
    imgs = tree.xpath('//img[contains(@class,"product-image")]//@src')
    for i, pd in enumerate(products):
        texts = pd.xpath('.//text()')
        links = pd.xpath('.//@href')
        if len(texts) < 2 or not links:
            # Entry without a second text node or without a link: skip it
            # rather than let an IndexError abort the rest of the page.
            continue
        link = links[0]
        # Images are paired with products by position; tolerate a page that
        # exposes fewer images than products.
        img = imgs[i] if i < len(imgs) else None
        # repr() renders tabs/newlines as literal "\t" / "\n" sequences,
        # which the substitution then removes.
        result = re.sub(r"\\t|\\n", '', repr(texts[1]))
        # NOTE(review): ``result`` keeps the quotes added by repr(), so it is
        # never the empty string and this check always passes. It is retained
        # so the stored name format stays unchanged.
        if result != '':
            print('Found [%s][%s] in %s' % (result, link, tag))
            a += 1
            json_dict.setdefault(tag, []).append((result, link, img))
# Driver: read the menu of the start page, queue one (href, name) job per
# menu link, crawl the jobs concurrently, then report and save the results.
r = session.get(url)
tree = html.fromstring(r.text)
a_tags = tree.xpath('//li[@class="menu-item"]//a')
for anchor in a_tags:
    hrefs = anchor.xpath('.//@href')
    # The pattern is written against repr() of the text-node list, where
    # tabs/newlines show up as literal backslash escapes.
    names = regex.findall(repr(anchor.xpath('.//text()')))
    if not hrefs or not names:
        # A menu link with no href or no recognisable label cannot be
        # crawled; skip it instead of crashing on an empty list.
        continue
    queue.put((hrefs[0], names[0]))

# One worker per queued job, capped by the pool's free slots. Each worker
# keeps pulling jobs until the queue is empty, so this covers the whole queue.
for _ in range(min(queue.qsize(), pool.free_count())):
    pool.spawn(worker)
pool.join()

# Summary: number of products recorded and total elapsed time.
print(a)
print('%s %s' % ('Time Taken : ', datetime.now() - start_time))
with open('data.json', 'w') as fp:
    json.dump(json_dict, fp)