forked from yellekelyk/scrape-spec
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathSpec2006Data.py
More file actions
111 lines (87 loc) · 3.74 KB
/
Spec2006Data.py
File metadata and controls
111 lines (87 loc) · 3.74 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
from SpecDataBase import *
import SpecDataElem
import Table
from ordereddict import OrderedDict
import pdb
import urllib
import BeautifulSoup
class Spec2006Data(SpecDataBase):
    "A class that parses and holds spec2006 data"

    # NOTE(review): this is Python 2 code using BeautifulSoup 3 (nextSibling,
    # findAll, firstText) and urllib.urlopen — it will not run under Python 3
    # or bs4 without porting.

    def __init__(self, soup, elem=SpecDataElem):
        # Delegate to the base class; `soup` is the parsed SPEC results index
        # page, `elem` is the element class used to hold one result row.
        SpecDataBase.__init__(self,soup,elem=elem)

    def htmlTables(self, soup):
        # Return all result tables on the index page; SPEC CPU2006 marks them
        # with class="idx_table".
        return soup.findAll(attrs={"class":"idx_table"})

    def parseTable(self, tab):
        # Parse one idx_table into a Table object, one entry per result row.
        # For each row whose cell has class "hw_model", this also FETCHES the
        # per-result detail page over HTTP and merges its data in via
        # __parseDetails__ — so this method does network I/O per row.
        table = Table.Table(str(tab.a.text), self.getElem()().attrs())
        # Collect the set of column header class names (values unused; the
        # OrderedDict is used only for ordered, de-duplicated keys).
        headers = OrderedDict()
        hdrs = tab.findAll(attrs={"class":"header"})
        for hdr in hdrs:
            next = hdr.th
            while next:
                try:
                    headers[str(next.get("class"))] = 0
                except AttributeError:
                    # NavigableString siblings (bare text) have no .get();
                    # skip them.
                    pass
                next = next.nextSibling
        line = tab.tbody.tr
        #loop through all lines in table
        while line:
            # ignore intertable headers
            if line.get("class") != "intertable odd header":
                # One fresh element object per result row.
                saveData = self.getElem()()
                #saveData = SpecDataElem.SpecDataElem()
                entry = line.td
                while entry:
                    # Skip the whitespace text nodes between <td> tags.
                    if entry != '\n':
                        # Each cell's class attribute names the field it holds
                        # (e.g. "hw_model", "basemean").
                        attr = entry.get("class")
                        data = str(entry.text)
                        # If the cell has a text child, prefer the text that
                        # precedes it — presumably strips trailing footnote or
                        # link text from the value (TODO confirm on live page).
                        if entry.firstText() != None:
                            tmp = entry.firstText()
                            data = str(tmp.previousSibling)
                            data = data.replace(' ', '')
                        #print attr, data
                        saveData.update(attr,data)
                        # this is where we get the link to hw_model
                        if attr == "hw_model":
                            # The hw_model cell links to the full result page;
                            # fetch it and parse the hardware/score details.
                            link = str("http://www.spec.org/cpu2006/results/" +
                                       str(entry.a['href']))
                            html = urllib.urlopen(link).read()
                            saveData.update("link", link)
                            soup = BeautifulSoup.BeautifulSoup(html)
                            self.__parseDetails__(saveData, soup)
                    entry = entry.nextSibling
                table.addEntry(saveData)
            # go to next line
            line = line.findNextSibling("tr")
        return table

    def __parseDetails__(self, saveData, soup):
        # Merge fields from one per-result detail page into `saveData`:
        # hardware availability date, the "Hardware" attribute table, and the
        # per-benchmark base-ratio scores.
        saveData.update("hw_avail",
                        str(soup.find(attrs={"id":"hw_avail_val"}).text))
        tab = soup.find(attrs={"id":"Hardware"})
        line = tab.tbody.tr
        #loop through all lines in table
        while line:
            # Each hardware row is a <th> label / <td> value pair.
            attr = str(line.th.text)
            data = str(line.td.text)
            saveData.update(attr,data)
            # go to next line
            line = line.findNextSibling("tr")
        # Benchmark scores: one row per benchmark in the results table.
        results = soup.find(attrs={"class":"resultstable"})
        line = results.table.tbody.tr
        while line:
            entry = line.td
            # First cell must be the benchmark-name cell (class "bm").
            if entry.get("class") != "bm":
                raise Exception("Expected class 'bm', got " +
                                entry.get("class"))
            testName = str(entry.text).replace(' ', '')
            testScore = None
            # Scan the remaining cells for the selected base ratio column.
            while entry:
                if entry.get("class") == "basecol ratio selected":
                    testScore = str(entry.text).replace(' ', '')
                entry = entry.findNextSibling("td")
            if testScore == None:
                # Deliberately best-effort: warn and store None rather than
                # aborting the whole scrape on a missing score.
                print "Missing score for " + testName
                #raise Exception("Missing score for " + testName)
                #pdb.set_trace()
            saveData.update(testName, testScore)
            line = line.findNextSibling("tr")