HPSearch_sklearn.py

import os.path
import sys
import time

import joblib
import mlflow
import pandas as pd
from sklearn.utils import shuffle
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
# hp tuning with sklearn
from sklearn.model_selection import cross_val_score, KFold
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from hyperopt import fmin, tpe, STATUS_OK, Trials
# import functions
from functions.sub_surrender_models import reshape_model_input
from functions.sub_surrender_profiles import get_model_input
from functions.sub_sklearn_hyperopt import get_search_space

poly_degree_max = 4  # see literature references in paper (>4 leads to numerical instability and high multicollinearity)


def run_hpsearch(surrender_profile: int, resampling: str = 'None'):
    '''
    Load data -> define objective -> get hp search space -> run the hp search
    for logit, random forest and xgboost.
    resampling is one of 'None', 'SMOTE' or 'undersampling'.
    '''
    # path variables
    cwd = os.path.dirname(os.path.realpath(__file__))
    path_save_models = os.path.join(cwd, 'profile_{}'.format(surrender_profile))
    path_data = os.path.join(cwd, 'profile_{}'.format(surrender_profile))
    # adjust path if resampling is applied (optional)
    if resampling == 'SMOTE':
        path_save_models = os.path.join(path_save_models, 'models', 'SMOTE')
    elif resampling == 'undersampling':
        path_save_models = os.path.join(path_save_models, 'models', 'Undersampling')
    elif resampling == 'None':
        pass
    else:
        raise ValueError('resampling type unknown!')
    # import the training data
    X_train = pd.read_csv(os.path.join(path_data, 'X_train.csv'), index_col=0)
    y_train = pd.read_csv(os.path.join(path_data, 'y_train.csv'), index_col=0).values.flatten()
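
    # optional resampling of the training data. imblearn's resamplers do not
    # shuffle their output (SMOTE, e.g., appends the synthetic samples at the
    # end), so the data is shuffled afterwards; the KFold object used below
    # splits contiguous blocks (shuffle=False)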
    if resampling == 'SMOTE':
        X_train, y_train = SMOTE().fit_resample(X_train, y_train)
        X_train, y_train = shuffle(X_train, y_train)
    elif resampling == 'undersampling':
        X_train, y_train = RandomUnderSampler(sampling_strategy='majority').fit_resample(X_train, y_train)
        X_train, y_train = shuffle(X_train, y_train)
    elif resampling == 'None':
        pass
    else:
        raise ValueError('Input not compatible!')
    # restrict data to the relevant features -> assumes proper exploratory data analysis
    features_profile_lst = get_model_input(surrender_profile)
    X_train = X_train[features_profile_lst]

    # construction of the model inputs
    n_input = X_train.shape[1]
    X_train_logit = reshape_model_input(X_train, degrees_lst=[poly_degree_max] * n_input)
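    # (reshape_model_input is a project-specific helper; judging from its use
    # here, it presumably expands each of the n_input features into polynomial
    # terms up to degree poly_degree_max, so that the linear logit model can
    # pick up non-linearities that rf/xgboost capture natively)
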
    def objective(params):
        '''
        Objective of the hparam search: mean neg_log_loss (alias binary
        cross-entropy) under K=3 fold CV.
        '''
        classifier_type = params['type']
        del params['type']
        print(params)
        cross_val_obj = KFold(n_splits=3)  # note: shuffle=True would introduce the random state as an additional hparam to tune
        tic = time.time()
        if classifier_type == 'rf':
            clf = RandomForestClassifier(**params)
            entropy = cross_val_score(clf, X_train, y_train, cv=cross_val_obj, scoring='neg_log_loss', n_jobs=-1).mean()
        elif classifier_type == 'logit':
            clf = LogisticRegression(**params)
            entropy = cross_val_score(clf, X_train_logit, y_train, cv=cross_val_obj, scoring='neg_log_loss', n_jobs=-1).mean()
        elif classifier_type == 'xgboost':
            clf = XGBClassifier(**params)
            entropy = cross_val_score(clf, X_train, y_train, cv=cross_val_obj, scoring='neg_log_loss', n_jobs=-1).mean()
        else:
            raise ValueError('Unknown classifier_type!')
        # note: fmin() minimizes the objective; scoring 'neg_log_loss' already
        # returns the entropy with a negative sign, which has to be cancelled
        # out for minimization
        return {'loss': -entropy, 'status': STATUS_OK, 'eval_time': time.time() - tic}
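
    # run one TPE search per model family; each search records its evaluations
    # in its own Trials object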
    for name in ['logit', 'rf', 'xgboost']:
        trials = Trials()  # start logging information
        search_space = get_search_space(name)
        if (name == 'logit') and (resampling == 'SMOTE'):
            search_space['penalty'] = 'l2'  # l1 computationally inefficient
            eval_number = 32
            try:
                del search_space['random_state']
            except KeyError:
                pass
        else:
            eval_number = 128
        # start the hparam search
        with mlflow.start_run():
            _ = fmin(
                fn=objective,
                space=search_space,
                algo=tpe.suggest,
                max_evals=eval_number,
                trials=trials)
        # save the hp tuning result
        joblib.dump(trials, os.path.join(path_save_models, 'hyperopt_{}.pkl'.format(name)))
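        # the pickled Trials object can later be reloaded with joblib.load()
        # to inspect e.g. trials.best_trial or the full evaluation history

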
if __name__ == '__main__':
    # choose resampling type via cmd-line argument, e.g. python HPSearch_sklearn.py SMOTE
    try:
        res_type = sys.argv[1]
    except IndexError:
        res_type = 'undersampling'  # default if no argument is given
    if res_type not in ['None', 'SMOTE', 'undersampling']:
        raise ValueError('Unknown type of resampling. User input not compatible.')
    for i in range(4):
        run_hpsearch(surrender_profile=i, resampling=res_type)