-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathutils.py
More file actions
124 lines (94 loc) · 3.45 KB
/
utils.py
File metadata and controls
124 lines (94 loc) · 3.45 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
"""
Utility functions for Vinted scraper.
"""
import re
from typing import Optional, Tuple
from urllib.parse import urlencode, urljoin
from config import BASE_URL, CATALOG_URL, CATEGORIES
def build_search_url(
    search_text: str = "",
    category: str = "",
    page: int = 1,
    min_price: Optional[float] = None,
    max_price: Optional[float] = None,
    condition: Optional[str] = None,
    order: str = "relevance"
) -> str:
    """Assemble a catalog search URL from the given filters.

    Only filters that carry a meaningful value end up in the query string.

    Args:
        search_text: Free-text query, sent as ``search_text`` when non-empty.
        category: Key into ``CATEGORIES``; the entry's ``"id"`` is sent as
            ``catalog[]``. Empty or unknown keys are ignored.
        page: Page number, sent as ``page`` only when greater than 1.
        min_price: Lower price bound, sent as ``price_from`` when not None.
        max_price: Upper price bound, sent as ``price_to`` when not None.
        condition: Item condition, sent as ``status[]`` when non-empty.
        order: Sort order, sent as ``order`` unless empty or "relevance".

    Returns:
        ``CATALOG_URL`` followed by ``?`` and the encoded query string, or
        ``CATALOG_URL`` alone when no filter applies.
    """
    known_category = bool(category) and category in CATEGORIES

    # (include?, query key, value) in the order the parameters are emitted.
    candidates = (
        (bool(search_text), "search_text", search_text),
        (
            known_category,
            "catalog[]",
            CATEGORIES[category]["id"] if known_category else None,
        ),
        (page > 1, "page", page),
        (min_price is not None, "price_from", min_price),
        (max_price is not None, "price_to", max_price),
        (bool(condition), "status[]", condition),
        (bool(order) and order != "relevance", "order", order),
    )
    selected = {key: value for wanted, key, value in candidates if wanted}

    encoded = urlencode(selected, doseq=True)
    if not encoded:
        return CATALOG_URL
    return f"{CATALOG_URL}?{encoded}"
def parse_alt_text(alt_text: str) -> dict:
    """
    Parse the image alt text to extract structured data.

    Format examples:
    - "Title, brand: Brand, condition: Condition, size: Size, $10.00, $12.00 includes Buyer Protection"
    - "Title, brand: Brand, condition: Condition, $10.00, $12.00 includes Buyer Protection"

    Args:
        alt_text: Alt text of a listing image. ``None`` or an empty string
            yields a result with every field empty.

    Returns:
        A dict with the string keys ``title``, ``brand``, ``condition``,
        ``size``, ``price`` and ``price_with_protection``. Any field that
        cannot be found in ``alt_text`` is left as an empty string.
    """
    result = {
        "title": "",
        "brand": "",
        "condition": "",
        "size": "",
        "price": "",
        "price_with_protection": ""
    }
    if not alt_text:
        return result

    # The labelled fields all share one shape: "<label>: <value>" running up
    # to the next comma.
    for field in ("brand", "condition", "size"):
        field_match = re.search(rf'{field}:\s*([^,]+)', alt_text)
        if field_match:
            result[field] = field_match.group(1).strip()

    # Extract prices - format: $X.XX, $Y.YY includes Buyer Protection
    price_match = re.search(
        r'\$([0-9,.]+),\s*\$([0-9,.]+)\s*includes?\s*Buyer\s*Protection',
        alt_text,
    )
    if price_match:
        result["price"] = f"${price_match.group(1)}"
        result["price_with_protection"] = f"${price_match.group(2)}"

    # The title is everything before "brand:". Listings without a brand would
    # otherwise lose their title, so fall back to the next recognised marker
    # (condition, size, or the first price). The brand pattern is tried first
    # so results for alt texts that do carry a brand are unchanged.
    title_match = re.match(r'^(.+?),\s*brand:', alt_text)
    if title_match is None:
        title_match = re.match(
            r'^(.+?),\s*(?:condition:|size:|\$[0-9])', alt_text
        )
    if title_match:
        result["title"] = title_match.group(1).strip()

    return result
def extract_item_id(url: Optional[str]) -> str:
    """Extract item ID from Vinted URL.

    Args:
        url: Item URL or path containing ``/items/<digits>``, for example
            ``/items/1234567890-item-slug``. ``None`` or an empty string is
            accepted, matching how the other parsing helpers in this module
            treat missing input.

    Returns:
        The run of digits that follows ``/items/``, or an empty string when
        ``url`` is missing or contains no such segment.
    """
    # A missing href arrives as None; return the "not found" value instead of
    # letting re.search raise TypeError.
    if not url:
        return ""
    match = re.search(r'/items/(\d+)', url)
    return match.group(1) if match else ""
def extract_favorites_count(aria_label: Optional[str]) -> int:
    """Extract favorites count from button aria-label.

    Args:
        aria_label: Label text such as
            "Add to favorites, favorited by X users". ``None`` or an empty
            string is accepted.

    Returns:
        The integer that follows "favorited by ", or 0 when the label is
        missing or does not contain that phrase.
    """
    # A button without an aria-label yields None; report zero favorites
    # rather than letting re.search raise TypeError.
    if not aria_label:
        return 0
    match = re.search(r'favorited by (\d+)', aria_label)
    return int(match.group(1)) if match else 0
def make_absolute_url(url: str) -> str:
    """Return an absolute form of ``url``.

    A value that already starts with "http" is handed back untouched;
    anything else is resolved against ``BASE_URL`` with ``urljoin``.
    """
    already_absolute = url.startswith("http")
    return url if already_absolute else urljoin(BASE_URL, url)
def clean_price(price_str: str) -> str:
    """Return ``price_str`` with surrounding whitespace removed.

    Falsy input (empty string or ``None``) yields an empty string.
    """
    return price_str.strip() if price_str else ""