-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathASIN_Parser.py
More file actions
88 lines (70 loc) · 2.81 KB
/
ASIN_Parser.py
File metadata and controls
88 lines (70 loc) · 2.81 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
#ASIN_Parser
#1. User gives Merchant ID
#2. Download webpage of listings
#3. Get first set of ASINs and number of pages of listings for that merchant
#4. Cycle though the rest of the pages and take their ASINs
import re
import urllib.request
import argparse
import time
#webpage_dl - takes url details and downloads the webpage
#then passes it to asin_parse to analyze
def webpage_dl(url,merchantID_url_tag,merchantID):
webpage = urllib.request.urlopen(url+merchantID_url_tag+merchantID)
webContent = webpage.read().decode('utf-8')
pageCount = asin_parse(webContent,merchantID)
return(pageCount)
#asin_parse - takes in webpage, spits out ASINs
def asin_parse(webContent,merchantID):
#open and append to ASIN list file
filename = 'ASIN_List_'+merchantID
f = open(filename + ".txt", 'a')
#find all asin numbers listed on webpage
asin_pattern1 = 'asin\=\"\w\w\w\w\w\w\w\w\w\w'
asin_pattern2 = '\w\w\w\w\w\w\w\w\w\w'
#search downloaded webpage for ASIN tag
asin_list = re.findall(asin_pattern1,webContent)
#write each asin to file, removing the "asin=" part
#of the string from previous regex, using the 2nd pattern
if asin_list != []:
for i in asin_list:
f.write(re.findall(asin_pattern2,i)[0]+'\n')
else:
print('Failed to find ASIN!\n')
return(-1)
#close file
f.close
#find number of seller pages for later iterating download
page_count_pattern1 = 'a\-disabled\"\>\d\d'
page_count_pattern2 = '\d\d'
page_count2 = []
#filter webpage text by regexes to find number of seller pages
page_count1 = re.findall(page_count_pattern1,webContent)
if page_count1 != []:
for j in page_count1:
page_count2.extend(re.findall(page_count_pattern2,j))
return(page_count2[0])
else:
print('Failed to find Page Count!\n')
return(-1)
def main():
#take in merchant ID as command line argument
arg_parser = argparse.ArgumentParser(description='List all ASINs from Amazon Seller')
arg_parser.add_argument("-m",type=str,help="Amazon Merchant ID #",required=True)
args = arg_parser.parse_args()
#merchantID is carried throughout functions because needed for naming file
merchantID = args.m
#initial page download writes first page of ASINs and retrieves page count
url = 'https://www.amazon.com/s?'
merchantID_url_tag='&merchant='
#call webpage_dl with url to retrieve page count
pageCount = webpage_dl(url,merchantID_url_tag,merchantID)
if (int(pageCount) < 0):
return(-1)
#now iterate through all the seller's pages
for i in range(2, int(pageCount)):
webpage_dl(url+'&page='+str(i),merchantID_url_tag,merchantID)
#have to wait or amazon cuts you off mid-run
time.sleep(1)
print('Operation Success!\n')
main()