-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmulti_Thread.py
More file actions
159 lines (132 loc) · 6.09 KB
/
multi_Thread.py
File metadata and controls
159 lines (132 loc) · 6.09 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
# -*- coding: utf-8 -*-
"""
Created on Wed Mar 13 10:36:27 2019
@author: 曾嘉鴻
"""
import requests
from PyQt5 import QtWidgets, QtCore
from bs4 import BeautifulSoup
import os
import re
import urllib.request
import json
class Thread(QtCore.QThread):
    """Worker thread that crawls the PTT Beauty board for articles posted on a
    given date, filters them by push ("like") count, and downloads any imgur
    images they contain.

    Progress is reported through the ``update`` signal (percentage 0-100) so
    the GUI thread can refresh widgets safely.
    """

    # Emitted with the current overall progress percentage (0-100).
    # Connect this from the GUI thread instead of touching widgets here:
    # calling widget methods from a worker thread is unsafe in Qt.
    update = QtCore.pyqtSignal(int)

    def __init__(self):
        QtCore.QThread.__init__(self)
        self.date_string = ""       # target post date in PTT index format (e.g. "3/13")
        self.Thread_count = 0       # thread count requested by the GUI (stored, unused here)
        self.like_count = 0         # minimum push count an article must reach
        self.download_complete = 0  # number of articles processed so far
        # NOTE(review): constructing a QWidget inside a QThread is unsafe if it
        # is ever parented/shown; kept only for backward compatibility with
        # existing callers. Progress is additionally emitted via `update`.
        self.progress_bar = QtWidgets.QProgressBar()

    def __del__(self):
        # Block until run() returns so Qt never destroys a running thread.
        self.wait()

    def set(self, date_string, like_count, Thread_count):
        """Store the crawl parameters chosen in the GUI before start()."""
        self.date_string = date_string
        self.like_count = like_count
        self.Thread_count = Thread_count

    def _report_progress(self, done, total):
        """Emit the progress percentage and mirror it onto the local bar.

        Guards against total == 0 (no matching articles) and always passes an
        int, which both pyqtSignal(int) and QProgressBar.setValue expect.
        """
        percent = int(done * 100 / total) if total else 100
        self.update.emit(percent)
        self.progress_bar.setValue(percent)

    def run(self):
        """Collect every article posted on self.date_string, then download
        images from each one whose push count meets self.like_count.

        Writes the collected article metadata to data.json when finished.
        """
        self.update.emit(0)
        self.progress_bar.setValue(0)
        url = 'https://www.ptt.cc'
        current_page = self.get_web_page(url + '/bbs/Beauty/index.html')
        if not current_page:
            return  # index page unreachable; nothing to do
        articles = []  # every article found for the target date
        current_articles, prev_url = self.get_articles(current_page, self.date_string, 0)
        while current_articles:
            # Accumulate this page's matches, then walk back one index page.
            articles += current_articles
            current_page = self.get_web_page(url + prev_url)
            if not current_page:
                # Network/HTTP failure mid-walk: stop paging, keep what we have.
                break
            current_articles, prev_url = self.get_articles(current_page, self.date_string, 1)
        # Article list complete -- visit each qualifying article and save images.
        for article in articles:
            if article['push_count'] >= self.like_count:
                page = self.get_web_page(url + article['href'])
                if page:
                    img_urls = self.parse(page)
                    self.save(img_urls, article['title'])
                    article['num_images'] = len(img_urls)
                    print("Downloading ", article)
                self.download_complete += 1
                self._report_progress(self.download_complete, len(articles))
            else:
                # Below the push threshold: count it as processed and move on.
                self.download_complete += 1
                self._report_progress(self.download_complete, len(articles))
            print(article, " Download complete!")
        # Persist article metadata for later inspection.
        with open('data.json', 'w', encoding='utf-8') as f:
            json.dump(articles, f, indent=2, sort_keys=True, ensure_ascii=False)

    def _parse_push_count(self, nrec_string):
        """Convert the nrec cell text to an int push count.

        "爆" marks 100+ pushes and is treated as 100; non-numeric markers
        (e.g. "X1") fall through to 0.
        """
        if nrec_string == "爆":
            return 100
        if nrec_string:
            try:
                return int(nrec_string)
            except ValueError:
                pass  # deliberate: unparseable marker counts as 0
        return 0

    def get_articles(self, _dom, _date, _interger):
        """Parse one board index page.

        Returns (articles, prev_url) where articles is a list of dicts
        (title/href/push_count/num_images) for non-announcement posts dated
        _date, and prev_url is the relative link to the previous (older)
        index page. When _interger == 0 and this page has no match, it keeps
        walking back recursively until a page with matches is found
        (NOTE(review): unbounded if the date never appears -- confirm caller
        always passes a date present on the board).
        """
        soup = BeautifulSoup(_dom, 'html.parser')
        url = 'https://www.ptt.cc'
        # The second anchor in the paging button group is the "previous page" link.
        paging_div = soup.find('div', 'btn-group btn-group-paging')
        prev_url = paging_div.find_all('a')[1]['href']
        articles = []
        print("date_input", _date)
        for d in soup.find_all('div', 'r-ent'):
            print(d.find('div', 'date').string.strip())
            if d.find('div', 'date').string.strip() != _date:
                continue  # wrong posting date
            link = d.find('a')
            if not link:
                continue  # no anchor: article was deleted
            if link.string.find("公告") != -1:
                continue  # skip board announcements
            articles.append({
                'title': link.string,
                'href': link['href'],
                'push_count': self._parse_push_count(d.find('div', 'nrec').string),
                'num_images': 0
            })
        if articles:
            return articles, prev_url
        if _interger == 0:
            # First call found nothing on this page: fetch the previous page
            # and keep searching backwards.
            current_page = self.get_web_page(url + prev_url)
            return self.get_articles(current_page, _date, 0)
        return articles, prev_url

    def get_web_page(self, url):
        """GET the page text, or None on a non-200 response.

        The over18 cookie bypasses PTT's age-confirmation interstitial.
        """
        resp = requests.get(url=url, cookies={'over18': '1'})
        if resp.status_code != 200:
            print('Invalid url:', resp.url)
            return None
        return resp.text

    def parse(self, _dom):
        """Return every imgur link found in the article's main content."""
        soup = BeautifulSoup(_dom, 'html.parser')
        links = soup.find(id='main-content').find_all('a')
        # Dots escaped: the original pattern let '.' match any character,
        # so e.g. "imgurXcom" would have been accepted.
        imgur_re = re.compile(r'^https?://(i\.)?(m\.)?imgur\.com')
        return [link['href'] for link in links if imgur_re.match(link['href'])]

    def save(self, img_urls, title, path='C:/Users/user/Desktop/練習圖片/'):
        """Download img_urls into a directory named after the article title.

        path is the parent folder; the default keeps the original hard-coded
        location for backward compatibility. Failures are best-effort: one
        bad article must not abort the whole crawl.
        """
        if not img_urls:
            return
        try:
            directory_name = title.strip()  # trim surrounding whitespace for the folder name
            # exist_ok avoids FileExistsError when re-downloading an article.
            os.makedirs(path + directory_name, exist_ok=True)
            for img_url in img_urls:
                # Normalize to the direct-image host i.imgur.com.
                if img_url.split('//')[1].startswith('m.'):
                    img_url = img_url.replace('//m.', '//i.')
                if not img_url.split('//')[1].startswith('i.'):
                    img_url = img_url.split('//')[0] + '//i.' + img_url.split('//')[1]
                if not img_url.endswith('.jpg'):
                    img_url += '.jpg'  # imgur serves the raw image when .jpg is appended
                fname = img_url.split('/')[-1]
                urllib.request.urlretrieve(img_url, path + os.path.join(directory_name, fname))
        except Exception as e:
            # Deliberate broad catch: log and continue with the next article.
            print(e)