-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathListScrapper.py
More file actions
39 lines (31 loc) · 1.09 KB
/
ListScrapper.py
File metadata and controls
39 lines (31 loc) · 1.09 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
from bs4 import BeautifulSoup
from urllib.request import urlopen
from Models.ListEntry import ListEntry
class ListScrapper:
    """Scrape a Tibia fandom wiki list page.

    Reads the first HTML table on the page at ``url`` and collects, one
    per row at most, the absolute URL of the first site-relative
    ("/wiki/...") link found in that row.

    NOTE(review): ``ListEntry`` is imported at module level but never
    used here — ``scrapRow`` returns plain URL strings; confirm whether
    rows were meant to be wrapped in ``ListEntry`` objects.
    """

    def __init__(self, url):
        # Page to scrape; fetched lazily in scrap().
        self.url = url
        # Accumulated results of the most recent scrap() call.
        self.listEntries = []
        # Anchors on the page are site-relative ("/wiki/..."); this base
        # turns them into absolute URLs.
        self.HREF_BASE = 'https://tibia.fandom.com'

    def scrap(self):
        """Fetch the page and return the list of absolute wiki links.

        Returns the (also stored) ``self.listEntries`` list; rows with
        no "/wiki/" link contribute nothing. Returns an empty list when
        the page has no table.
        """
        # Reset so repeated calls do not accumulate duplicate entries.
        self.listEntries = []
        # Context-manage the HTTP response so it is always closed
        # (the original leaked the connection).
        with urlopen(self.url) as page:
            html = page.read().decode("utf-8")
        soup = BeautifulSoup(html, "html.parser")
        table = soup.find('table')
        if table is None:
            # No table on the page: nothing to scrape (the original
            # raised a bare IndexError here).
            return self.listEntries
        for row in table.findAll('tr'):
            entry = self.scrapRow(row)
            if entry is not None:
                self.listEntries.append(entry)
        return self.listEntries

    def scrapRow(self, row):
        """Return the absolute URL of the first "/wiki/..." anchor in
        the row's <td> cells, or None if the row contains no such link.
        """
        for col in row.findAll('td'):
            for a in col.findAll('a', href=True):
                href = a['href']
                if href.startswith("/wiki/"):
                    # First matching anchor wins. The original kept
                    # overwriting without breaking the inner loop, so
                    # the LAST matching anchor in a cell won instead.
                    return self.HREF_BASE + href
        return None