-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscraper.py
More file actions
125 lines (92 loc) · 3.17 KB
/
scraper.py
File metadata and controls
125 lines (92 loc) · 3.17 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
import requests
import sys
from tqdm.auto import tqdm
from bs4 import BeautifulSoup
import json
import time
# Smogon forum threads to scan for Pokemon Showdown replay links
# (each thread is paginated; see get_all_replay_links).
urlList = [
    "https://www.smogon.com/forums/threads/umpl-v-replays-and-usage-stats.3771911/"
]
def get_team_contents(url: str):
    """Fetch a Pokemon Showdown replay log and extract each player's team.

    Appends ".log" to *url* to get the plain-text battle protocol, then scans
    it for "|player|<slot>|<name>|..." and "|poke|<slot>|<species>|..." lines.

    Returns:
        dict mapping player slot (e.g. "p1") to
        {"name": <player name>, "team": [sorted species strings]},
        {} if the log had no player lines (replay not saved yet — the caller
        retries on {}), or None if the HTTP request fails.
    """
    headers = {"User-Agent": "PostmanRuntime/7.32.3"}
    req = requests.get(f"{url}.log", headers=headers)
    if req.status_code != 200:
        return None
    data = {}
    for line in req.content.decode("utf-8").split(sep="\n"):
        parsed = line.split(sep="|")
        # Protocol lines look like "|player|p1|Name|..."; parsed[0] is "".
        if len(parsed) < 4:
            continue
        if parsed[1] == "player":
            tqdm.write(f"Found player {parsed[2]} with name {parsed[3]}")
            # A player can appear more than once (e.g. rejoin); keep the first.
            if parsed[2] not in data:
                data[parsed[2]] = {
                    "name": parsed[3],
                    "team": [],
                }
        elif parsed[1] == "poke":
            # Guard: skip poke lines whose player line was never seen
            # (the original raised KeyError here).
            if parsed[2] not in data:
                continue
            tqdm.write(f"Found pokemon {parsed[3]} owned by {parsed[2]}")
            data[parsed[2]]["team"].append(parsed[3])
    # BUG FIX: the original returned as soon as the FIRST player's team
    # reached 6 entries, dropping the other player's team whenever the
    # log groups all of one player's |poke| lines together. Parse the
    # whole log, then sort every team.
    for entry in data.values():
        entry["team"].sort()
    return data
def get_all_replay_links(url: str):
    """Collect all gen9zu replay links from a paginated forum thread.

    Fetches thread pages ("page-2", "page-3", ...) until the forum serves a
    page with the same <title> as the previous one (XenForo redirects
    past-the-end page numbers to the last page), or a request fails.

    Returns a list of unique replay URLs with query strings stripped.
    """
    correct_links = []
    seen = set()  # query-stripped URLs already collected
    page = 1
    origUrl = url
    lastTitle = None
    while True:
        try:
            headers = {"User-Agent": "PostmanRuntime/7.32.3"}
            # NOTE(review): the ".log" suffix looks copy-pasted from
            # get_team_contents; a forum page presumably doesn't need it —
            # confirm before removing, since the scraper works as-is.
            req = requests.get(f"{url}.log", headers=headers)
            soup = BeautifulSoup(req.content, "html.parser")
            title = soup.find_all("title")
            # Same <title> as the previous page => we ran past the last page.
            if title == lastTitle:
                return correct_links
            lastTitle = title
            for link in soup.find_all("a", href=True):
                href = link["href"]
                if not ("https://replay.pokemonshowdown.com/smogtours-gen9zu" in href
                        or "https://replay.pokemonshowdown.com/gen9zu" in href):
                    continue
                # BUG FIX: dedupe on the query-stripped URL. The original
                # checked the raw href but stored the stripped form, so the
                # same replay with different query strings was added twice.
                stripped = href.split("?")[0]
                if stripped in seen:
                    continue
                seen.add(stripped)
                correct_links.append(stripped)
            page += 1
            url = f"{origUrl}page-{page}"
        except Exception:
            # Best-effort: on any network/parse failure, return what we have
            # (narrowed from the original bare `except:`, which also caught
            # KeyboardInterrupt/SystemExit).
            return correct_links
# --- Script body: gather replay links from every thread, fetch each
# --- replay's teams, and dump the results to test.json.
links = []
for url in urlList:
    links.extend(get_all_replay_links(url))
for x in links:
    print(x)
tqdm.write(f"Found {len(links)} links")
if links:
    output = []
    # Process oldest replays first (thread pages list newest first).
    links.reverse()
    for link in tqdm(links, colour="green", desc="Fetching teams.."):
        teamcontents = get_team_contents(link)
        # {} means the replay isn't saved server-side yet; retry after a
        # short delay. None means the request itself failed.
        while teamcontents == {}:
            time.sleep(0.5)
            teamcontents = get_team_contents(link)
        # BUG FIX: the original duplicated this None check twice in a row.
        if teamcontents is None:
            continue
        output.append({
            "link": link,
            "teams": teamcontents
        })
    # Context manager replaces the original open/write/close triple.
    with open("test.json", "w") as f:
        f.write(json.dumps(output))
# testing
# print(get_team_contents("https://replay.pokemonshowdown.com/gen81v1-1907362871"))