-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy patharmoryScrape.py
More file actions
107 lines (86 loc) · 3.04 KB
/
armoryScrape.py
File metadata and controls
107 lines (86 loc) · 3.04 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
# Scrape all equipment to new json dict
# Outputs armory_v#.json
import bs4
import requests
import myScrapingLib as msl
from myScrapingLib import getSoup
import json
def headCellName(headCell):
    """Return the column name represented by a table header cell.

    If the cell contains a link that carries a ``title`` attribute, that
    title is the name. Otherwise (no link, or a link without a title) the
    cell's own text, stripped of surrounding whitespace, is used.

    headCell is expected to offer ``find('a')`` and ``.text`` the way the
    parsed-HTML tags used elsewhere in this file do.
    """
    colRep = headCell.find('a')
    if colRep is not None:
        # .get() instead of ['title']: a link without a title attribute
        # should fall back to the cell text rather than raise KeyError.
        title = colRep.attrs.get('title')
        if title is not None:
            return title
    return headCell.text.strip()
def entryCellVal(entryCell):
    """Return the value represented by a table body cell.

    If the cell contains a link that carries a ``title`` attribute, that
    title is the value. Otherwise (no link, or a link without a title) the
    cell's text is returned with surrounding whitespace stripped, matching
    how header cells and scraped row cells are normalised in this file.

    entryCell is expected to offer ``find('a')`` and ``.text`` the way the
    parsed-HTML tags used elsewhere in this file do.
    """
    cellLink = entryCell.find('a')
    if cellLink is not None:
        # .get() instead of ['title']: a link without a title attribute
        # should fall back to the cell text rather than raise KeyError.
        title = cellLink.attrs.get('title')
        if title is not None:
            return title
    return entryCell.text.strip()
# Characters that are flattened to a plain space in the final JSON output:
# non-breaking space, newline and bullet.
_WHITESPACE_MARKERS = str.maketrans({'\u00a0': ' ', '\n': ' ', '\u2022': ' '})


def _columnNames(headColumns):
    """Return one column name per header cell, in header order.

    The first header cell is always called 'name'. Every other cell is named
    after the title of the link it contains, or after its own stripped text
    when it has no titled link, so the result always has exactly as many
    names as there are header cells (for a non-empty header).
    """
    names = ['name']  # First th unlike rest
    for th in headColumns[1:]:
        anchor = th.find('a')
        title = anchor.attrs.get('title') if anchor is not None else None
        names.append(title if title is not None else th.text.strip())
    return names


def _cleanWhitespace(value):
    """Return a copy of value with whitespace markers replaced by spaces.

    Strings (including dict keys) are translated; dicts, lists and tuples
    are walked recursively; anything else is returned untouched. Cleaning
    the data before serialising avoids text-replacing escape sequences in
    the JSON output, which could corrupt a literal backslash followed by
    'n' in the scraped text.
    """
    if isinstance(value, str):
        return value.translate(_WHITESPACE_MARKERS)
    if isinstance(value, dict):
        return {_cleanWhitespace(k): _cleanWhitespace(v)
                for k, v in value.items()}
    if isinstance(value, (list, tuple)):
        return [_cleanWhitespace(v) for v in value]
    return value


# --- Step 1: collect the slot-table pages listed on the category index ---
wikiNS = 'http://oldschoolrunescape.wikia.com'
tableIndex = '/wiki/Category:Slot_tables'
tableIndexSoup = getSoup(wikiNS + tableIndex)
titledLinks = tableIndexSoup.select('a[title]')
slotTableLinks = [link for link in titledLinks
                  if link.attrs['title'].endswith('slot table')]

# Made into dict to remove dupes: several links may point at the same table.
tableDict = {}
for link in slotTableLinks:
    href = link.attrs.get('href')
    if href is None:
        continue  # a titled anchor without a target cannot be followed
    slotType = link.attrs['title'].split(' ')[0]
    tableDict[slotType] = wikiNS + href

# --- Step 2: walk every slot table and scrape each item it lists ---
print('From dict of table page uris')
print('Making dict of table heads and body elems')
equipmentDict = {}
i = 0  # number of items scraped so far, across all tables
print(tableDict.keys())
for tableType, value in tableDict.items():
    print(tableType + ' : ' + value)
    tablePage = getSoup(value)
    wikiTables = tablePage.select('table[class~=wikitable]')
    if not wikiTables:
        print('No wikitable found, skipping ' + value)
        continue
    tableInPage = wikiTables[0]
    body = tableInPage.find('tbody')
    if body is None:
        # Not every HTML parser synthesises a <tbody>; rows then hang
        # directly off the <table>.
        body = tableInPage

    # assume header is first row in tbody with no 'td'
    head = body.find(lambda tag: tag.name == 'tr' and not tag.find_all('td'))
    if head is None:
        print('No header row found, skipping ' + value)
        continue
    headColumns = head.find_all('th')
    num_columns = len(headColumns)
    column_names = _columnNames(headColumns)

    # assume entries are rows in tbody with all 'td'
    # size of each entry must match size of header
    tableEntries = body.find_all(
        lambda tag: tag.name == 'tr'
        and len(tag.find_all('td')) == num_columns)
    print(len(tableEntries))

    for entry in tableEntries:
        entryCells = entry.find_all('td')
        # assume first entry cell has link out; skip the row when it does not
        entryLink = entryCells[0].find('a') if entryCells else None
        if entryLink is None or 'href' not in entryLink.attrs:
            print('Row without item link, skipping')
            continue
        entry_resource = entryLink.attrs['href']
        print(entry_resource)

        resourceToken = msl.rt_dict.getToken(entry_resource)
        # key corresponding to equipment table type
        record = equipmentDict[resourceToken] = {'type': tableType}
        for column_name, cell in zip(column_names, entryCells):
            record[column_name] = cell.text.strip()

        # Enrich the table row with data from the item's own article page.
        equipmentSoup = getSoup(wikiNS + entry_resource)
        info_box = msl.osrsInfoBox(equipmentSoup)
        record.update(info_box.items())
        record['article-text'] = msl.osrsAsNL(equipmentSoup)

        i += 1
        if i == 10:
            # Small sample of the first ten items, written as-is.
            with open('armory_10_v2.json', 'w') as armory_file:
                armory_file.write(json.dumps(equipmentDict))

# --- Step 3: write the full result and persist the token dictionaries ---
with open('armory_v2.json', 'w') as armory_file:
    armory_file.write(json.dumps(_cleanWhitespace(equipmentDict)))
msl.saveTokenDicts()