# -*- coding: utf-8 -*-
"""Courier-tracking crawler.

Consolidates the three modules added by the first commit of this patch
series (download.py, kuaidi_1.py, orderno.py):

* ``Download`` fetches a URL through a random proxy with a random User-Agent.
* ``Orderno`` reads the tracking numbers that still need crawling from MySQL.
* ``Kuaidu`` queries kuaidi.com for each number and stores the tracking events.
"""
import datetime
import random
import re
import time


# --------------------------------------------------------------------------
# download.py
# --------------------------------------------------------------------------
class Download:
    """Download page content over HTTP."""

    def __init__(self):
        """Set up the pools of header values and proxies to pick from."""
        self.user_agent_list = [
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
            "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
            "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:55.0) Gecko/20100101 Firefox/55.0",
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36",
        ]
        self.head_connection = ['keep-alive']
        self.head_accept_language = ['zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3']
        self.head_accept = ['text/css,*/*;q=0.1']
        self.proxy = [
            '106.5.173.163:3276',
            '118.117.138.173:2645',
            '111.74.232.220:9756',
            '60.17.248.204:2121',
            '116.208.96.24:3154',
            '100.18.21.221:8110',
            '117.68.145.78:2644',
            '182.111.49.213:4162',
            '36.34.14.53:6436',
            '114.226.135.105:9287',
            '222.189.89.180:5638',
            '36.34.15.96:6436',
            '117.57.170.138:3852',
            '112.85.10.250:1131',
            '115.219.76.29:2316',
            '60.173.24.251:6890',
            '117.71.152.248:2319',
            '106.110.249.222:3456',
            '60.187.145.145:2315',
            '117.90.2.47:3217',
            '222.163.253.2:2862',
            '123.189.48.142:9706',
            '60.160.186.100:7654',
            '36.33.18.1:6436',
            '171.215.203.35:2645',
            '59.62.194.171:6344',
            '114.99.22.214:6890',
            '111.77.20.64:4162',
            '182.100.162.23:4162',
            '60.168.23.241:2644',
            '42.54.231.82:3529',
            '115.153.104.137:2314',
            '117.68.242.119:2644',
            '106.5.5.120:9756',
            '100.18.25.49:8110',
            '182.111.98.113:2314',
            '49.67.138.134:2137',
            '117.68.242.186:2644',
            '223.215.149.202:2319',
            '175.151.220.99:1767',
            '183.145.53.113:2315',
            '117.90.2.51:3217',
            '36.45.194.35:3215',
            '123.152.37.190:2682',
            '117.70.137.207:6436',
        ]

    def _build_headers(self):
        """Return request headers with a randomly chosen User-Agent."""
        return {
            'Connection': self.head_connection[0],
            'Accept': self.head_accept[0],
            # The original sent 'Acccept-Language' / 'Use-Agent'; servers
            # ignore unknown header names, so the rotation had no effect.
            'Accept-Language': self.head_accept_language[0],
            'User-Agent': random.choice(self.user_agent_list),
        }

    def get_url(self, url, timeout, num_retries=3):
        """Fetch ``url`` and return the response, retrying on network errors.

        :param url: address to request
        :param timeout: per-request timeout passed to ``requests.get``
        :param num_retries: extra attempts after the first failure
        :return: the ``requests`` response, or ``None`` if every attempt failed
        """
        # Third-party dependency, imported where it is used so the rest of
        # the module stays importable without it.
        import requests

        # One initial attempt plus ``num_retries`` retries; every attempt
        # draws a fresh User-Agent and proxy, as the recursive original did.
        for retries_left in range(num_retries, -1, -1):
            proxies = {'http': random.choice(self.proxy)}
            try:
                return requests.get(url, timeout=timeout,
                                    headers=self._build_headers(),
                                    proxies=proxies)
            except requests.RequestException:
                print("获取网页出错")
                if retries_left > 0:
                    print('获取页面倒数第%s次' % retries_left)
        return None


# --------------------------------------------------------------------------
# orderno.py
# --------------------------------------------------------------------------
class Orderno():
    """Read the tracking numbers that still need crawling from MySQL."""

    # A tracking number is any run of at least eight digits in the column.
    _ORDER_NO_RE = re.compile(r'[0-9]{8,}')

    # Purchase tasks of the last 30 days that are not fully stocked in.
    _PENDING_SQL = (
        "select {column} from purchasetask where stockinstatus != 'All Stock In' "
        "and shippingorderno != '' and (purchasetask_process_step in ('Start','Under Supervisor Confirmation',"
        "'Under Confirmation','Confirmed','Awaiting Products','Check In Products','Partial Check In') or "
        "purchasetask_process_step is null) and DATE_SUB(CURDATE(), INTERVAL 30 DAY) <= purchase_date "
        "ORDER BY purchase_date"
    )

    # A stored event whose description contains "签收" (signed for) marks
    # the shipment as finished.
    _DELIVERED_SQL = ("SELECT 1 FROM shippingtrackdetail WHERE shippingorderno=%s "
                      "AND description LIKE %s LIMIT 1")

    def __init__(self, host='192.168.1.242', user='nijun', password='nijun',
                 port=3306, db='pms', charset='utf8'):
        """Open the database connection and a cursor on it.

        The defaults reproduce the previously hard-coded settings so that
        ``Orderno()`` keeps working.
        NOTE(review): credentials should come from configuration or the
        environment rather than from source code.
        """
        # Third-party dependency, imported where it is used so the rest of
        # the module stays importable without it.
        import pymysql

        self.db = pymysql.connect(host=host, user=user, password=password,
                                  port=port, db=db, charset=charset)
        self.cursor = self.db.cursor()

    def _is_delivered(self, number):
        """Return True if a "signed for" event is already stored for ``number``."""
        self.cursor.execute(self._DELIVERED_SQL, (number, '%签收%'))
        return bool(self.cursor.rowcount)

    def _pending_order_numbers(self, sql):
        """Run ``sql`` and return the undelivered numbers found in column 0."""
        cursor = self.cursor
        cursor.execute(sql)
        rows = cursor.fetchall()
        pending = []
        seen = set()
        for row in rows:
            if not row[0]:
                continue
            for number in self._ORDER_NO_RE.findall(row[0]):
                # Several purchase tasks can share one shipment; look each
                # number up (and later crawl it) only once.
                if number in seen:
                    continue
                seen.add(number)
                if not self._is_delivered(number):
                    pending.append(number)
        return pending

    def _query_pending(self, column):
        """Best-effort wrapper: report a failure and return an empty list."""
        try:
            return self._pending_order_numbers(
                self._PENDING_SQL.format(column=column))
        except Exception as exc:
            # The original swallowed the error and returned None, which then
            # crashed the caller on len(None).
            print('Failed to load %s list: %s' % (column, exc))
            return []

    def repleni_orderno(self):
        """Return pending numbers taken from the ``shippingorderno`` column.

        :return: list of number strings; empty if the query fails
        """
        return self._query_pending('shippingorderno')

    def shipping_orderno(self):
        """Return pending numbers from both order-number columns, merged.

        Numbers from ``shippingorderno`` come first, followed by those from
        ``replenishmentorderno`` that are not already in the list.

        :return: list of number strings; empty if both queries fail
        """
        merged = self.repleni_orderno()
        seen = set(merged)
        for number in self._query_pending('replenishmentorderno'):
            if number not in seen:
                seen.add(number)
                merged.append(number)
        return merged


# --------------------------------------------------------------------------
# kuaidi_1.py
# --------------------------------------------------------------------------
class Kuaidu:
    """Crawl kuaidi.com tracking events and store them in the database."""

    def __init__(self):
        """Set the URL parts and open the database access object."""
        self.bash_url = 'http://www.kuaidi.com/index-ajaxselectcourierinfo-'
        self.bash_html = '-.html'
        self.orderno = Orderno()

    def data(self):
        """Build the tracking URL for every pending number and crawl it."""
        print('Begin')
        orderno_list = self.orderno.shipping_orderno() or []
        print('今天要爬取的采购单数目: %s' % len(orderno_list))
        for number in orderno_list:
            url = self.bash_url + number + self.bash_html
            self.get_text(number, url)

    def get_text(self, shippingorderno, url):
        """Fetch the tracking events for one number and save each of them.

        :param shippingorderno: tracking number being crawled
        :param url: tracking URL for that number
        """
        response = Download().get_url(url, 5)
        if not response:
            return
        try:
            payload = response.json()
        except ValueError:
            # Body was not JSON; skip this number.
            return
        print("开始获取%s的信息 " % shippingorderno)
        # Tolerate replies without a usable 'data' list instead of raising.
        records = payload.get('data') if isinstance(payload, dict) else None
        for n, record in enumerate(records or [], 1):
            track_date = record['time'].strip()
            description = record['context'].strip()
            self.save_date(shippingorderno, description, track_date)
            time.sleep(0.8)  # throttle between stored events
            print('已完成%s' % n)

    def save_date(self, shippingorderno, description, track_date):
        """Insert one tracking event unless it is already stored.

        :param shippingorderno: tracking number
        :param description: event text scraped from the site
        :param track_date: event timestamp scraped from the site
        """
        order = self.orderno
        db = order.db
        cursor = order.cursor
        create_date = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        data = {
            'shippingorderno': shippingorderno,
            'description': description,
            'create_date': create_date,
            'track_date': track_date,
        }
        table = 'shippingtrackdetail'
        keys = ', '.join(data)
        values = ', '.join(['%s'] * len(data))
        sql = 'INSERT INTO {table}({keys}) VALUES ({values})'.format(
            table=table, keys=keys, values=values)

        # Skip events that are already stored. ``description`` is scraped
        # text, so it is passed as a parameter, never formatted into the SQL.
        cursor.execute(
            "SELECT 1 FROM shippingtrackdetail WHERE shippingorderno=%s "
            "AND description=%s LIMIT 1",
            (shippingorderno, description))
        if cursor.rowcount:
            print('%s:%s 已经存在' % (shippingorderno, description))
            return

        try:
            cursor.execute(sql, tuple(data.values()))
            db.commit()
        except Exception as exc:
            print('Failed: %s' % exc)
            db.rollback()
        else:
            # Report success only once the row is committed.
            print('%s:%s save successful' % (shippingorderno, description))
a6f76a2ea03cdd26fe26b42b18100b924a2bf7b0 Mon Sep 17 00:00:00 2001 From: Jerryning Date: Fri, 3 Nov 2017 11:47:52 +0800 Subject: [PATCH 2/4] Update orderno.py --- orderno.py | 24 +++++++----------------- 1 file changed, 7 insertions(+), 17 deletions(-) diff --git a/orderno.py b/orderno.py index 5996910..5e0267f 100644 --- a/orderno.py +++ b/orderno.py @@ -6,20 +6,16 @@ class Orderno(): def __init__(self): - self.db = pymysql.connect(host='192.168.1.242', user='nijun', password='nijun', port=3306, db='pms', + self.db = pymysql.connect(host='your dress', user='username', password='password', port=3306, db='', charset='utf8') self.cursor = self.db.cursor() def repleni_orderno(self): """ - 获取shippingorderno单列表 + 获取单列表 :return: """ - sql = "select shippingorderno from purchasetask where stockinstatus != 'All Stock In' " \ - "and shippingorderno != '' and (purchasetask_process_step in ('Start','Under Supervisor Confirmation'," \ - "'Under Confirmation','Confirmed','Awaiting Products','Check In Products','Partial Check In') or " \ - "purchasetask_process_step is null) and DATE_SUB(CURDATE(), INTERVAL 30 DAY) <= purchase_date" \ - " ORDER BY purchase_date" + sql = "" cursor = self.cursor try: cursor.execute(sql) @@ -37,8 +33,7 @@ def repleni_orderno(self): repleni_orderno = [] for m in replen: for n in m: - sql = "SELECT * from shippingtrackdetail where shippingorderno='{}' " \ - "and description LIKE '%{}%'".format(n, '签收') + sql = "" cursor.execute(sql) isExists = cursor.rowcount if not isExists: @@ -49,14 +44,10 @@ def repleni_orderno(self): def shipping_orderno(self): """ - 获取replenishmentorderno单列表,并将两个表合并 + 获取单列表,并将两个表合并 :return: """ - sql = "select replenishmentorderno from purchasetask where stockinstatus != 'All Stock In' " \ - "and shippingorderno != '' and (purchasetask_process_step in ('Start','Under Supervisor Confirmation'," \ - "'Under Confirmation','Confirmed','Awaiting Products','Check In Products','Partial Check In') or " \ - "purchasetask_process_step is 
null) and DATE_SUB(CURDATE(), INTERVAL 30 DAY) <= purchase_date " \ - "ORDER BY purchase_date;" + sql = "" cursor = self.cursor try: cursor.execute(sql) @@ -74,8 +65,7 @@ def shipping_orderno(self): orderno = self.repleni_orderno() for m in replen: for n in m: - sql = "SELECT * from shippingtrackdetail where shippingorderno='{}' " \ - "and description LIKE '%{}%'".format(n, '签收') + sql = "" cursor.execute(sql) isExists = cursor.rowcount if not isExists: From d0ae6246311502e391a682f3e3f4759565225029 Mon Sep 17 00:00:00 2001 From: Jerryning Date: Fri, 3 Nov 2017 11:49:15 +0800 Subject: [PATCH 3/4] Delete a.md --- a.md | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 a.md diff --git a/a.md b/a.md deleted file mode 100644 index e69de29..0000000 From 38dbb5bef01545dd171138c2f1ae9c5b9800582e Mon Sep 17 00:00:00 2001 From: Jerryning Date: Fri, 3 Nov 2017 13:39:51 +0800 Subject: [PATCH 4/4] Update kuaidi_1.py --- kuaidi_1.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/kuaidi_1.py b/kuaidi_1.py index d81b377..5191766 100644 --- a/kuaidi_1.py +++ b/kuaidi_1.py @@ -34,7 +34,6 @@ def get_text(self, shippingorderno, url): """ download = Download() response = download.get_url(url, 5) - n = 0 if response: try: j = response.json() @@ -47,8 +46,6 @@ def get_text(self, shippingorderno, url): description = data['context'].strip() self.save_date(shippingorderno, description, track_date) time.sleep(0.8) - n += 1 - print('已完成%s' % n) def save_date(self, shippingorderno, description, track_date): """ @@ -85,4 +82,4 @@ def save_date(self, shippingorderno, description, track_date): print('Failed') db.rollback() else: - print('%s:%s 已经存在' % (shippingorderno, description)) \ No newline at end of file + print('%s:%s 已经存在' % (shippingorderno, description))