-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathListScrapper.py
More file actions
39 lines (31 loc) · 1.09 KB
/
ListScrapper.py
File metadata and controls
39 lines (31 loc) · 1.09 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
from bs4 import BeautifulSoup
from urllib.request import urlopen
from Models.ListEntry import ListEntry
class ListScrapper:
    """Scrape a Tibia fandom wiki list page.

    Reads the first HTML table on the page at ``url`` and collects, one
    per row at most, the absolute URL of the first site-relative
    ("/wiki/...") link found in that row.

    NOTE(review): ``ListEntry`` is imported at module level but never
    used here — ``scrapRow`` returns plain URL strings; confirm whether
    rows were meant to be wrapped in ``ListEntry`` objects.
    """

    def __init__(self, url):
        # Page to scrape; fetched lazily in scrap().
        self.url = url
        # Accumulated results of the most recent scrap() call.
        self.listEntries = []
        # Anchors on the page are site-relative ("/wiki/..."); this base
        # turns them into absolute URLs.
        self.HREF_BASE = 'https://tibia.fandom.com'

    def scrap(self):
        """Fetch the page and return the list of absolute wiki links.

        Returns the (also stored) ``self.listEntries`` list; rows with
        no "/wiki/" link contribute nothing. Returns an empty list when
        the page has no table.
        """
        # Reset so repeated calls do not accumulate duplicate entries.
        self.listEntries = []
        # Context-manage the HTTP response so it is always closed
        # (the original leaked the connection).
        with urlopen(self.url) as page:
            html = page.read().decode("utf-8")
        soup = BeautifulSoup(html, "html.parser")
        table = soup.find('table')
        if table is None:
            # No table on the page: nothing to scrape (the original
            # raised a bare IndexError here).
            return self.listEntries
        for row in table.findAll('tr'):
            entry = self.scrapRow(row)
            if entry is not None:
                self.listEntries.append(entry)
        return self.listEntries

    def scrapRow(self, row):
        """Return the absolute URL of the first "/wiki/..." anchor in
        the row's <td> cells, or None if the row contains no such link.
        """
        for col in row.findAll('td'):
            for a in col.findAll('a', href=True):
                href = a['href']
                if href.startswith("/wiki/"):
                    # First matching anchor wins. The original kept
                    # overwriting without breaking the inner loop, so
                    # the LAST matching anchor in a cell won instead.
                    return self.HREF_BASE + href
        return None