-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcrawler.py
More file actions
109 lines (88 loc) · 3.98 KB
/
crawler.py
File metadata and controls
109 lines (88 loc) · 3.98 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import argparse
from urllib.parse import urlparse, parse_qs
def get_gallery_id_from_url(url):
    """Extract the gallery ID from the URL's query string.

    Args:
        url (str): A gallery URL such as
            'https://gall.dcinside.com/mgallery/board/lists/?id=record'.

    Returns:
        str: The first value of the ``id`` query parameter, or the fallback
        name 'gallery' when the URL cannot be parsed or carries no
        non-empty ``id`` parameter.
    """
    fallback = 'gallery'
    try:
        query_params = parse_qs(urlparse(url).query)
    except (ValueError, TypeError, AttributeError):
        # urlparse raises ValueError for malformed URLs (e.g. a broken IPv6
        # host) and TypeError/AttributeError for non-string input.
        return fallback
    # parse_qs drops blank values, so a missing key covers both "no id" and
    # "id=". Falling back here keeps callers from building names like
    # "None.xlsx".
    gallery_ids = query_params.get('id')
    return gallery_ids[0] if gallery_ids else fallback
def _build_page_url(base_url, page_num):
    """Return base_url with its ``page`` query parameter set to page_num.

    Works whether or not base_url already has a query string, and replaces
    an existing ``page`` parameter instead of appending a duplicate.
    """
    # Function-scope import: the module header only imports urlparse/parse_qs.
    from urllib.parse import urlencode

    parts = urlparse(base_url)
    params = parse_qs(parts.query, keep_blank_values=True)
    params['page'] = [str(page_num)]
    return parts._replace(query=urlencode(params, doseq=True)).geturl()


def _parse_post_row(tr, search_word=None):
    """Turn one list-table row into a post dict, or return None to skip it.

    A row is skipped when any expected cell is missing, when its number is
    not purely digits, when the title anchor has no usable href, or when
    search_word is given and does not occur in the title.
    """
    # Function-scope import: urljoin is not imported at module level.
    from urllib.parse import urljoin

    num_cell = tr.select_one('.gall_num')
    title_element = tr.select_one('.gall_tit a')
    author_cell = tr.select_one('.gall_writer')
    views_cell = tr.select_one('.gall_count')
    liked_cell = tr.select_one('.gall_recommend')
    cells = (num_cell, title_element, author_cell, views_cell, liked_cell)
    if any(cell is None for cell in cells):
        return None

    num = num_cell.text.strip()
    # Rows without a numeric post number are skipped (presumably notices or
    # ads -- confirm against the live markup).
    if not num.isdigit():
        return None

    title = title_element.text.strip()
    # --- Search Word Filter (case-sensitive substring match) ---
    if search_word and search_word not in title:
        return None

    # .get() avoids a KeyError on anchors that carry no href attribute.
    href = title_element.get('href')
    if not href or not isinstance(href, str):
        return None

    return {
        'Number': num,
        'Title': title,
        'Author': author_cell.text.strip(),
        'Views': views_cell.text.strip(),
        # urljoin resolves site-relative paths and leaves absolute URLs intact.
        'Link': urljoin('https://gall.dcinside.com', href),
        'Liked': liked_cell.text.strip(),
    }


def crawl_gallery(base_url, page_range, search_word=None, *, timeout=10, delay=1):
    """Crawl a DCInside gallery list and save the collected posts to Excel.

    Each page in the range is fetched, its table rows are parsed into posts,
    and the result is written to '<gallery id>.xlsx' in the current working
    directory. Pages that fail to download are reported and skipped.

    Args:
        base_url (str): The base URL of the gallery list.
        page_range (tuple): A tuple containing the start and end page
            numbers (both inclusive).
        search_word (str, optional): A word to filter titles by. Defaults
            to None, which keeps every post.
        timeout (float, optional): Seconds to wait for each HTTP request
            before giving up on that page. Defaults to 10.
        delay (float, optional): Seconds to pause between consecutive page
            requests. Defaults to 1.

    Returns:
        None. Progress and results are reported with print().
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
    }
    all_posts = []
    start_page, end_page = page_range

    for page_num in range(start_page, end_page + 1):
        # Pause before every request except the first, so failed pages are
        # throttled too and no time is wasted after the final page.
        if page_num != start_page:
            time.sleep(delay)

        paginated_url = _build_page_url(base_url, page_num)
        print(f"Crawling page: {paginated_url}")
        try:
            # Without a timeout a stalled connection would block forever.
            response = requests.get(paginated_url, headers=headers, timeout=timeout)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Error fetching page {page_num}: {e}")
            continue

        soup = BeautifulSoup(response.text, 'html.parser')
        for tr in soup.select('tbody > tr'):
            post = _parse_post_row(tr, search_word)
            if post is not None:
                all_posts.append(post)

    if not all_posts:
        print("No valid posts found across the specified pages with the given criteria.")
        return

    # Guard against a missing id so the output is never named "None.xlsx".
    gallery_id = get_gallery_id_from_url(base_url) or 'gallery'
    excel_filename = f"{gallery_id}.xlsx"
    pd.DataFrame(all_posts).to_excel(excel_filename, index=False)
    print(f"Saved {len(all_posts)} posts to {excel_filename}")
def main(argv=None):
    """Parse command-line arguments and run the crawler.

    Args:
        argv (list, optional): Argument strings to parse instead of
            sys.argv[1:]. Defaults to None, which reads the real command
            line.

    Raises:
        SystemExit: With status 2, after a usage message on stderr, when
            the arguments or the page range are invalid.
    """
    parser = argparse.ArgumentParser(description="Crawl a DCInside gallery and save posts to an Excel file.")
    parser.add_argument('-l', '--link', required=True, help="The full URL of the gallery board list. E.g., 'https://gall.dcinside.com/mgallery/board/lists/?id=record'")
    parser.add_argument('-p', '--pages', required=True, help="The range of pages to crawl. E.g., '1-5' (or a single page such as '3')")
    parser.add_argument('-S', '--search-word', required=False, help="An optional word to search for in post titles.")
    args = parser.parse_args(argv)

    # Accept "START-END" or a single page number "N" (treated as N-N).
    first, dash, last = args.pages.partition('-')
    try:
        start_page = int(first)
        end_page = int(last) if dash else start_page
    except ValueError:
        # parser.error() prints to stderr and exits with status 2, so shell
        # callers can detect the failure.
        parser.error(f"invalid page range {args.pages!r}: expected START-END (e.g., 1-5) or a single page number")
    if start_page <= 0 or end_page < start_page:
        parser.error("page range must be positive and in increasing order (e.g., 1-5)")

    print("Running crawler...")
    crawl_gallery(args.link, (start_page, end_page), args.search_word)
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()