-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathshoeScraper.py
More file actions
108 lines (92 loc) · 3.16 KB
/
shoeScraper.py
File metadata and controls
108 lines (92 loc) · 3.16 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
# shoeScraper - singleShoe
# an automated tool for discovering and retrieving data
# on available merchandise from a specified distributors
import time
import pandas
from bs4 import BeautifulSoup
from selenium import webdriver
# import libraries used in this sequence
# create a selenium session with firefox
# open the assigned url
mainPage = 'http://mezlan.com/mezlan'
driver = webdriver.Firefox()
driver.get(mainPage)
# scroll down for all the items to load
driver.execute_script("window.scrollTo(0, 150000)")
time.sleep(3)
driver.execute_script("window.scrollTo(0, 300000)")
time.sleep(3)
driver.execute_script("window.scrollTo(0, 450000)")
time.sleep(3)
driver.execute_script("window.scrollTo(0, 600000)")
time.sleep(3)
driver.execute_script("window.scrollTo(0, 800000)")
time.sleep(3)
driver.execute_script("window.scrollTo(0, 1000000)")
time.sleep(3)
# if there is a link to continue loading, we should click that now.
if driver.find_element_by_partial_link_text('Load more items'):
print('Loading more items')
driver.find_element_by_partial_link_text('Load more items').click()
else:
print('Scrolling down more')
# then see if there's more to load, just in case.
driver.execute_script("window.scrollTo(0, 1200000)")
time.sleep(3)
driver.execute_script("window.scrollTo(0, 1400000)")
time.sleep(3)
# parse the generated html using beautifulsoup4 and lxml
soup = BeautifulSoup(driver.page_source, 'lxml')
surls = soup.find_all('a', class_="product-image")
# set up a few empty arrays from which we will copy to a csv later on
links = []
names = []
prices = []
styles = []
items = []
# loops pulling the URL for each shoe into a list
for i in surls:
links.append(i.get('href'))
# for each found url, we open a page and pull the needed information
for link in links:
driver.get(link)
subSoup = BeautifulSoup(driver.page_source, 'lxml')
name = subSoup.find('span', class_='h1')
nameClean = name.text
price = subSoup.find('span', class_="price")
priceClean = price.text
style = subSoup.find('div', class_="product-style")
styleClean = style.text
names.append(nameClean)
prices.append(priceClean)
styles.append(styleClean)
'''
# todo:
#
# get colors
swatch_tags = []
for color in color_tags:
color.click()
color_str = []
item['color'] = color_str
singleColor = subSoup.find('div', class_="input-box").ul.li.img['alt']
# get sizes
size_width_list = []
size_tags = []
for size in size_tags:
size.click()
size_str = []
# get widths
width_tags = []
for width in width_tags:
width.click()
width_str = []
availability_tag = []
size_width_list.append([size_str, width_str, availability_tag.text])
item['size_width_list'] = size_width_list
items.append(item)
return items
'''
# after each page has been processed, we append each list of variables to a new column in a csv
df = pandas.DataFrame(data={"Name": names, "Price": prices, "Style Number": styles}) # "SKU": items,
df.to_csv("./mezlan.csv", sep=",", index="False")