# Source: VintedScraper/utils.py (kami4ka/VintedScraper, main branch)
"""
Utility functions for Vinted scraper.
"""

import re
from typing import Optional, Tuple
from urllib.parse import urlencode, urljoin

from config import BASE_URL, CATALOG_URL, CATEGORIES


def build_search_url(
    search_text: str = "",
    category: str = "",
    page: int = 1,
    min_price: Optional[float] = None,
    max_price: Optional[float] = None,
    condition: Optional[str] = None,
    order: str = "relevance"
) -> str:
    """
    Assemble a catalog URL carrying only the filters that were actually set.

    Args:
        search_text: Free-text query; omitted when empty.
        category: Key into CATEGORIES; omitted when empty or not a known key.
        page: Page number; only sent when greater than 1.
        min_price: Lower price bound; omitted when None (0 is still sent).
        max_price: Upper price bound; omitted when None (0 is still sent).
        condition: Value for the "status[]" filter; omitted when falsy.
        order: Sort order; omitted when empty or equal to "relevance".

    Returns:
        CATALOG_URL followed by "?<query>" when at least one filter applies,
        otherwise CATALOG_URL unchanged.
    """
    # Collected as ordered (name, value) pairs; urlencode accepts these
    # exactly like a mapping and keeps the order.
    filters = []

    if search_text:
        filters.append(("search_text", search_text))

    category_known = bool(category) and category in CATEGORIES
    if category_known:
        filters.append(("catalog[]", CATEGORIES[category]["id"]))

    if page > 1:
        filters.append(("page", page))

    # Price bounds are compared against None so that 0 remains a valid bound.
    for name, bound in (("price_from", min_price), ("price_to", max_price)):
        if bound is not None:
            filters.append((name, bound))

    if condition:
        filters.append(("status[]", condition))

    if order and order != "relevance":
        filters.append(("order", order))

    encoded = urlencode(filters, doseq=True)
    if not encoded:
        return CATALOG_URL
    return f"{CATALOG_URL}?{encoded}"


def parse_alt_text(alt_text: str) -> dict:
    """
    Parse the image alt text to extract structured data.

    Format examples:
    - "Title, brand: Brand, condition: Condition, size: Size, $10.00, $12.00 includes Buyer Protection"
    - "Title, brand: Brand, condition: Condition, $10.00, $12.00 includes Buyer Protection"
    - "Title, condition: Condition, $10.00, $12.00 includes Buyer Protection"

    Args:
        alt_text: Raw alt attribute text of a listing image.

    Returns:
        A dict with the keys "title", "brand", "condition", "size", "price"
        and "price_with_protection". Every value is a string; a field that
        cannot be found in alt_text is left as "". Prices keep their
        leading "$".
    """
    result = {
        "title": "",
        "brand": "",
        "condition": "",
        "size": "",
        "price": "",
        "price_with_protection": ""
    }

    if not alt_text:
        return result

    # Labelled fields all share the shape "<label>: <value>," so one loop
    # covers them; the value runs up to the next comma.
    for field in ("brand", "condition", "size"):
        field_match = re.search(rf'{field}:\s*([^,]+)', alt_text)
        if field_match:
            result[field] = field_match.group(1).strip()

    # Extract prices - format: $X.XX, $Y.YY includes Buyer Protection
    price_match = re.search(r'\$([0-9,.]+),\s*\$([0-9,.]+)\s*includes?\s*Buyer\s*Protection', alt_text)
    if price_match:
        result["price"] = f"${price_match.group(1)}"
        result["price_with_protection"] = f"${price_match.group(2)}"

    # Extract title: everything before "brand:". The brand-anchored match is
    # tried first so inputs that carry a brand parse exactly as before.
    title_match = re.match(r'^(.+?),\s*brand:', alt_text)
    if not title_match:
        # No brand field: fall back to the next delimiter that can follow
        # the title (condition, size, or the first price) instead of
        # dropping the title altogether.
        title_match = re.match(r'^(.+?),\s*(?:condition:|size:|\$[0-9])', alt_text)
    if title_match:
        result["title"] = title_match.group(1).strip()

    return result


def extract_item_id(url: str) -> str:
    """
    Return the numeric item ID embedded in a Vinted item URL.

    The ID is the run of digits directly after "/items/", as in
    "/items/1234567890-item-slug". An empty string is returned when the
    URL contains no such segment.
    """
    found = re.search(r'/items/(\d+)', url)
    if found is None:
        return ""
    return found.group(1)


def extract_favorites_count(aria_label: str) -> int:
    """
    Read the favorites count out of a button's aria-label.

    Expected label shape: "Add to favorites, favorited by X users".
    Returns X as an int, or 0 when the label has no "favorited by <digits>"
    fragment.
    """
    found = re.search(r'favorited by (\d+)', aria_label)
    if found is None:
        return 0
    return int(found.group(1))


def make_absolute_url(url: str) -> str:
    """
    Return url as an absolute URL.

    A url that already starts with "http" is returned untouched; anything
    else is joined onto BASE_URL with urljoin.
    """
    already_absolute = url.startswith("http")
    return url if already_absolute else urljoin(BASE_URL, url)


def clean_price(price_str: str) -> str:
    """
    Normalize a price string by trimming surrounding whitespace.

    A falsy input (empty string or None) yields "".
    """
    return price_str.strip() if price_str else ""