-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmodelExpand.py
More file actions
191 lines (146 loc) · 7.41 KB
/
modelExpand.py
File metadata and controls
191 lines (146 loc) · 7.41 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
# tybens 11/5/2020
import pickle
import pandas as pd
import os
import numpy as np
import argparse # python command line flags
import multiprocessing as mp # multiprocessing!
import math
from itertools import combinations
from sklearn.model_selection import train_test_split
from utils.cleaning import str2bool
from utils.SuperLearner import SuperLearner
def prep(X, y, X_train, X_test, y_train, y_test, VITALS, LABEL, RAND_STATE, OPTIMIZED=None):
    """Refit the optimized SuperLearner and save it and its scores for productionApp.

    Loads ``./models/<LABEL>/SuperLearner<OPTIMIZED>SL.sav``, fits it on the
    training split, prints its scores on the test split, pickles the fitted
    model as ``./models/<LABEL>/MasterModel.sav`` and writes two CSV files
    under ``./production_data/<LABEL>/``: the model's scores for every row of
    ``X`` and the actual labels ``y``.

    Parameters
    ----------
    X, y :
        Full feature table and labels; ``X`` is scored with ``predict_proba``
        and ``y`` is written out with ``to_csv``.
    X_train, y_train :
        Data the loaded SuperLearner is fitted on.
    X_test, y_test :
        Data the fitted SuperLearner is scored on.
    VITALS : bool
        boolean for whether vitals are being worked with or not
        (accepted for interface compatibility; not used in this function).
    LABEL : str
        str what to label the saved file with and how to identify models to prep.
    RAND_STATE : int
        random state for train_test_split, must be the same as modelSearch was done
        (accepted for interface compatibility; not used in this function).
    OPTIMIZED : str, optional
        Default is None. The score metric ('OB', 'OF', 'OP') on which the model
        was optimized; it is interpolated into the file name of the model to load.
    """
    # NOTE: pickle executes arbitrary code on load -- only load model files
    # that this project produced itself.
    filename_superlearner = './models/{}/SuperLearner{}SL.sav'.format(LABEL, OPTIMIZED)
    with open(filename_superlearner, 'rb') as model_file:
        superLearner = pickle.load(model_file)

    # fit and score on large dataset
    superLearner.fit(X_train, y_train)
    scores = superLearner.scores(X_test, y_test)
    print(scores)

    # save model as superlearner object
    filename = './models/{}/MasterModel.sav'.format(LABEL)
    with open(filename, 'wb') as model_file:
        pickle.dump(superLearner, model_file)

    # --SAVING (this is for production of the figure!! not really the study)
    # Make sure the output directory exists before writing into it.
    os.makedirs('./production_data/{}'.format(LABEL), exist_ok=True)

    # second column of predict_proba -- presumably the positive class; confirm
    # against SuperLearner.predict_proba
    all_scores = superLearner.predict_proba(X)[:, 1]
    filename_all = './production_data/{}/all_scores.csv'.format(LABEL)
    all_scores.tofile(filename_all, sep=',', format='%10.5f')

    # only need this once!
    filename_actual = './production_data/{}/actual_scores_{}.csv'.format(LABEL, LABEL)
    y.to_csv(filename_actual, index=False)
def _expanded_model_names(relevant_cols):
    """Return one comma-joined column list per non-empty combination of relevant_cols."""
    names = []
    # for each possible number of missing NaNs
    for size in range(1, len(relevant_cols) + 1):
        # for each combination of this number of missing NaNs
        for comb in combinations(relevant_cols, size):
            cols = list(comb)
            # split up BP into sys and dia
            if 'BP' in comb:
                cols.remove('BP')
                cols.extend(['BP_sys', 'BP_dia'])
            if 'age' in comb:
                cols.extend(['age_group_Adult', 'age_group_Geriatric_65-80',
                             'age_group_Geriatric_80+', 'age_group_Pediatric'])
            names.append(','.join(cols))
    return names


def _collect_scores(workers, score_queue, expected):
    """Drain up to `expected` items from score_queue while the workers are running."""
    import queue  # local import: only needed for the Empty exception

    collected = []
    received = 0
    while received < expected:
        try:
            item = score_queue.get(timeout=1)
        except queue.Empty:
            if any(work.is_alive() for work in workers):
                continue
            # Every worker has exited: pick up anything that was flushed
            # between the timeout above and the liveness check, then stop.
            try:
                item = score_queue.get_nowait()
            except queue.Empty:
                break
        collected.extend(item)
        received += 1
    if received < expected:
        print('WARNING: expected {} score entries but only received {}'.format(expected, received))
    return collected


def main():
    """Fit and save one "expanded" model per combination of possibly-missing columns.

    Reads the module-level globals ``LABEL``, ``VITALS``, ``RAND_STATE``,
    ``PROP`` and ``OPTIMIZED`` (set by the ``__main__`` block). Steps:

    1. load the cleaned data set and split it into train/test,
    2. keep a ``PROP`` fraction of the training rows,
    3. call ``prep`` to fit and save ``MasterModel.sav``,
    4. build every non-empty combination of the possibly-missing columns,
       save the id/name table as ``df_id.sav``,
    5. fan the combinations out to worker processes running
       ``dropColsAndFitAndSave`` and gather the scores they put on a queue.

    Returns
    -------
    list
        Everything the workers put on the score queue, concatenated with
        ``list.extend``.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))
    # NOTE(review): this directory is relative to the script, while every
    # read/write below uses paths relative to the current working directory.
    dest_dir = os.path.join(script_dir, 'models', LABEL, 'expanded')
    os.makedirs(dest_dir, exist_ok=True)

    # load ALL data that is cleaned to match the features of what the models were trained on
    filename = 'data_vitals_cleaned.csv' if VITALS else 'data_cleaned.csv'
    data_matched = pd.read_csv('./data/' + filename)
    X = data_matched.drop('admit_binary', axis=1)
    y = data_matched['admit_binary']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=.2, shuffle=True, random_state=RAND_STATE)

    # only using proportion of total sample size
    # NOTE(review): sample() is not seeded, so the subset differs between runs.
    train = pd.concat([X_train, y_train], axis=1)
    sampled_train = train.sample(frac=PROP).reset_index(drop=True)
    y_train = sampled_train['admit_binary']
    X_train = sampled_train.drop('admit_binary', axis=1)

    # initial prep: clean and save all the data matched to the data the model was trained on, save as MasterModel
    prep(X, y, X_train, X_test, y_train, y_test, VITALS=VITALS, LABEL=LABEL,
         OPTIMIZED=OPTIMIZED, RAND_STATE=RAND_STATE)

    # -- this file is created during the prep() call above --
    # load optimized models that will be expanded (to encompass all combination of features)
    filename = './models/{}/MasterModel.sav'.format(LABEL)
    with open(filename, 'rb') as model_file:
        superLearner = pickle.load(model_file)

    # columns that might be NaNs
    relevant_cols = ['temp', 'HR', 'RR', 'O2', 'BP', 'ambulance', 'age'] if VITALS else ['ambulance', 'age']

    # to craft an identifier dataframe: one sequential id per combination
    names = _expanded_model_names(relevant_cols)
    unique_id = list(range(len(names)))

    # identifier dataframe (np.array of mixed ints/strs stores the ids as strings)
    df_id = pd.DataFrame(np.array([unique_id, names]).T, columns=['id', 'name'])
    filename = './models/{}/expanded/df_id.sav'.format(LABEL)
    with open(filename, 'wb') as id_file:
        pickle.dump(df_id, id_file)

    # split the jobs between 7 cpus
    num_cpus = 7
    total_expands = len(names)
    chunk_size = int(math.ceil(total_expands / num_cpus))
    # Materialise each chunk as a list: a zip iterator is single-use and is
    # not reliably picklable when handed to a child process.
    list_of_names = [
        list(zip(unique_id[i:i + chunk_size], names[i:i + chunk_size]))
        for i in range(0, total_expands, chunk_size)
    ]

    # multiprocessing: workers report their scores through this queue
    score_queue = mp.Queue()
    workers = [
        mp.Process(target=dropColsAndFitAndSave,
                   args=(chunk, X_train, X_test, y_train, y_test, superLearner, score_queue))
        for chunk in list_of_names
    ]
    for work in workers:
        work.start()

    # Drain the queue BEFORE joining: a process that has put items on a
    # multiprocessing.Queue cannot exit until they are consumed, so joining
    # first can deadlock. Each worker is expected to put one entry per model.
    ALLSCORES = _collect_scores(workers, score_queue, total_expands)

    for work in workers:
        work.join()

    return ALLSCORES
def dropColsAndFitAndSave(names, X_train, X_test, y_train, y_test, superLearner, score_queue):
    """Worker job: for each (uid, name) refit the model without the named columns and save it.

    Reads the module-level global ``LABEL`` to build the output paths.

    Parameters
    ----------
    names : iterable of (uid, name)
        ``name`` is a comma-separated list of column names to drop; ``uid``
        is used in the output file names.
    X_train, X_test :
        Feature tables; the listed columns are dropped from copies of these
        for every model, the inputs themselves are left unchanged.
    y_train, y_test :
        Labels passed to ``superLearner.fit`` / ``superLearner.scores``.
    superLearner :
        Model object that is re-fitted for every entry; it must provide
        ``fit``, ``scores`` and ``predict_proba`` and be picklable.
    score_queue :
        Queue-like object; the result of ``superLearner.scores`` is ``put``
        on it once per entry of ``names``.
    """
    # Make sure both output directories exist (safe when several workers race).
    os.makedirs('./models/{}/expanded'.format(LABEL), exist_ok=True)
    os.makedirs('./production_data/{}/expanded'.format(LABEL), exist_ok=True)

    for (uid, name) in names:
        cols = name.split(',')
        # Drop from the ORIGINAL frames each time. Rebinding X_train/X_test
        # here would make the drops accumulate across models and raise
        # KeyError as soon as a column is requested twice.
        X_train_dropped = X_train.drop(columns=cols)
        X_test_dropped = X_test.drop(columns=cols)

        # fit the data with dropped cols to new models
        superLearner.model_name = name
        superLearner.fit(X_train_dropped, y_train)
        scores = superLearner.scores(X_test_dropped, y_test)
        score_queue.put(scores)

        # save the models
        filename = './models/{}/expanded/MasterModel{}.sav'.format(LABEL, uid)
        with open(filename, 'wb') as model_file:
            pickle.dump(superLearner, model_file)

        # calculates and saves all_scores for the given fitted model and unique id
        X_dropped_cols = pd.concat([X_train_dropped, X_test_dropped], axis=0)
        all_scores = superLearner.predict_proba(X_dropped_cols)[:, 1]
        filename_all = './production_data/{}/expanded/all_scores_{}.csv'.format(LABEL, uid)
        all_scores.tofile(filename_all, sep=',', format='%10.5f')
# Command-line entry point: parse the flags into the module-level globals that
# main() and the worker function read, run the expansion, and save the scores.
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    # LABEL is required: it is used to derive VITALS and every output path.
    parser.add_argument("--LABEL", "-l", type=str, required=True, help="str label of which study to load the models from")
    parser.add_argument("--OPTIMIZED", "-o", type=str, help="str how the model was optimized 'OF' for f-score, 'OP' for PRAUC, 'OR' for AUROC")
    parser.add_argument("--RAND_STATE", "-rs", type=int, help="This must be the same as it was for the modelSearch.py. Change the random_state through with np and sklearn work")
    # Default to the full training set: with frac=None pandas' sample() would
    # silently fall back to returning a single row.
    parser.add_argument("--PROPORTION", "-p", type=float, default=1.0, help="The proportion of the training data to be used during fitting, float between 0 and 1")
    args = parser.parse_args()

    if not 0 < args.PROPORTION <= 1:
        parser.error("--PROPORTION must be greater than 0 and at most 1")

    RAND_STATE = args.RAND_STATE
    LABEL = args.LABEL
    OPTIMIZED = args.OPTIMIZED
    PROP = args.PROPORTION
    # a label containing the letter 'n' marks a study without vitals
    VITALS = 'n' not in LABEL

    ALLSCORES = main()

    # save all scores (header=False: the column names are not written to the file)
    columns = ['Model', 'AUROC', 'AUPRC']
    pd.DataFrame(ALLSCORES, columns=columns).to_csv('./models/{}/ALLEXPANDEDSCORES.csv'.format(LABEL), header=False, index=False, sep=',')