This repository was archived by the owner on Jun 11, 2024. It is now read-only.
forked from matthewosborne71/NCAA
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmodel.py
More file actions
215 lines (183 loc) · 7.04 KB
/
model.py
File metadata and controls
215 lines (183 loc) · 7.04 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
"""
Make a model to predict upsets
"""
import json
import os
import sys
import warnings
import numpy as np
import pickle
import pandas as pd
from sklearn import linear_model as lm
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import scale, OneHotEncoder
# Keep printed result tables narrow enough to read in a terminal
pd.set_option('display.max_columns', 10)
pd.set_option('display.width', 100)

# Ignore warnings
# Filters are only installed when the interpreter was started without -W options,
# so an explicit command-line choice always wins.
if not sys.warnoptions:
    warnings.simplefilter('default')
    for _category in (FutureWarning, DeprecationWarning, PendingDeprecationWarning):
        warnings.simplefilter('ignore', category=_category)
def get_columns():
    """
    Load the candidate feature columns and drop the ones left out of the model
    :returns: Column names from columns.json, in file order, without the excluded names
    """
    # Names are removed in this order; each removal drops only the first match
    # and raises ValueError when the name is not listed in columns.json.
    excluded = (
        'TopScore', 'BotScore',
        'TopTOV', 'BotTOV',
        'TopTOPer', 'BotTOPer',
        'TopTSPer', 'BotTSPer',
        'TopFT', 'BotFT',
        'TopFTA', 'BotFTA',
        'TopFTR', 'BotFTR',
        'TopTravel', 'BotTravel',
        'TopOppPTS', 'BotOppPTS',
    )
    with open('columns.json', 'r') as column_file:
        remaining = json.load(column_file)
    for name in excluded:
        remaining.remove(name)
    return remaining
# Seed pairings in bracket order; a pairing's integer code is its position + 1.
_SEED_TYPES = (
    'OneSixteen', 'TwoFifteen', 'ThreeFourteen', 'FourThirteen',
    'FiveTwelve', 'SixEleven', 'SevenTen', 'EightNine',
)

# Season that is never used for training, whichever year is being predicted.
_HOLDOUT_YEAR = 2018


def _feature_matrix(frame):
    """
    Convert a feature frame into the array handed to the classifier
    :param frame: Rows to convert; may contain an integer-coded SeedType column
    :returns: 2-D array; SeedType (if present) becomes the leading one-hot columns
    :raises ValueError: If a SeedType value is not one of the known codes
    """
    if 'SeedType' not in frame.columns:
        return frame.values
    # Encode against the full, fixed code list so training rows and prediction rows
    # always share one column layout, even if a subset lacks some pairing.
    seed_codes = frame['SeedType'].values.astype(int)
    known_codes = np.arange(1, len(_SEED_TYPES) + 1)
    one_hot = (seed_codes[:, np.newaxis] == known_codes).astype(float)
    if not one_hot.any(axis=1).all():
        raise ValueError('Unrecognized SeedType value')
    other_features = frame.drop(columns='SeedType').values.astype(float)
    return np.hstack([one_hot, other_features])


def _build_classifier(model_type):
    """
    Create an untrained classifier for the requested model type
    :param model_type: 'forest', 'gbc', 'svm' (or 'svc'); anything else gives logistic regression
    :returns: Unfitted scikit-learn estimator that supports predict_proba
    """
    if model_type == 'forest':
        return RandomForestClassifier()
    if model_type == 'gbc':
        return GradientBoostingClassifier()
    # 'svm' is the documented spelling; 'svc' is kept because it was the one the
    # code actually recognized.
    if model_type in ('svm', 'svc'):
        return SVC(probability=True)
    return lm.LogisticRegression()


def predict(year: int = 2017, model: str = 'model', new: bool = True, col_labels: list = None,
            model_type: str = None) -> None:
    """
    Train (or load) an upset classifier and print its predictions for one season
    :param year: Year to run predictions for
    :param model: Model name; the classifier is stored at ./Models/<model>.pickle
    :param new: Whether or not to create and train a new model (forced to True when
        no saved model with that name exists)
    :param col_labels: Columns to include in analysis (defaults to get_columns());
        the list passed in is not modified
    :param model_type: Type of model to use ('forest', 'gbc', 'svm', or None for logistic regression)
    :returns: None
    :raises ValueError: If no columns are selected or datafile.csv has no rows for year
    """
    # Initialize Data
    data = pd.read_csv('datafile.csv')
    model_file_path = './Models/' + model + '.pickle'

    # An existing model was requested but there is nothing to load: train one instead
    if not new and not os.path.isfile(model_file_path):
        print('Model not found. Creating new model...')
        new = True

    # data to pull from the data frame
    if col_labels is None:
        col_labels = get_columns()
    use_seed_type = 'SeedType' in col_labels
    # Fresh list, so the caller's col_labels is left untouched
    numeric_labels = [label for label in col_labels if label != 'SeedType']
    if not use_seed_type and not numeric_labels:
        raise ValueError('col_labels must name at least one column')

    # Standardize every selected column except SeedType, which is categorical
    if numeric_labels:
        data = data.astype({label: 'float64' for label in numeric_labels})
        data[numeric_labels] = scale(data[numeric_labels])

    # change SeedTypes to integers so they can be one-hot encoded and displayed compactly
    seed_codes = {name: code for code, name in enumerate(_SEED_TYPES, start=1)}
    data['SeedType'] = data['SeedType'].replace(seed_codes)

    # SeedType goes first so its one-hot columns lead the feature matrix
    feature_labels = (['SeedType'] if use_seed_type else []) + numeric_labels

    # current input set
    in_target_year = data['year'] == year
    if not in_target_year.any():
        raise ValueError('No rows found for year %d' % year)
    test_features = _feature_matrix(data.loc[in_target_year, feature_labels])

    # results to display (copied so new columns can be added without touching data)
    results_columns = ['SeedType', 'TopSeed', 'BotSeed', 'Upset']
    test_results = data.loc[in_target_year, results_columns].copy()

    # Create or Retrieve Model
    if new:
        # train on every season except the one being predicted and the holdout season
        in_training = ~in_target_year & (data['year'] != _HOLDOUT_YEAR)
        classifier = _build_classifier(model_type)
        classifier.fit(_feature_matrix(data.loc[in_training, feature_labels]),
                       data.loc[in_training, 'Upset'].values)
        # save model
        os.makedirs(os.path.dirname(model_file_path), exist_ok=True)
        with open(model_file_path, 'wb') as model_file:
            pickle.dump(classifier, model_file)
    else:
        # NOTE: unpickling can execute arbitrary code; only load model files that
        # were written by this script.
        with open(model_file_path, 'rb') as model_file:
            classifier = pickle.load(model_file)

    # Create Predictions
    # second column of predict_proba is the upset probability
    test_results['UpsetProbability'] = classifier.predict_proba(test_features)[:, 1]
    is_correct = test_results['Upset'] == test_results['UpsetProbability'].round()

    # calculate total number correct
    num_correct = int(is_correct.sum())

    # change formatting + look for readability
    test_results['Correct'] = is_correct.map({True: '✓', False: 'x'})
    test_results['Upset'] = test_results['Upset'].replace([0.0, 1.0], [0, 1])

    # sort predictions
    test_results = test_results.sort_values('UpsetProbability', ascending=False)

    # add sum row (blank everywhere except the Correct column) + extra formatting
    totals = pd.DataFrame({'Correct': [num_correct]}).reindex(columns=test_results.columns)
    test_results = pd.concat([test_results, totals], sort=True)
    test_results = test_results.replace(np.nan, '')

    # output data
    print('\n\nYear: %d\n' % year)
    print(test_results)
# Script entry point: run the default pipeline for the 2018 season
if __name__ == '__main__':
    predict(year=2018)