# federated_trainer/data_loader.py (markmo/federated_trainer, master branch)

import logging
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.datasets import load_diabetes, load_iris

# Directory that contains this module.
ROOT_DIR = Path(__file__).parent

# Fraction of the samples held out as the test set when splitting.
TEST_SET_RATIO = 0.15


class DataLoader(object):
    """Load datasets and split them into train and test sets.

    Every feature matrix produced by this class gets a trailing column of
    ones appended so that a model can learn an intercept term.
    """

    def __init__(self):
        """Initialise empty data slots and seed NumPy's global RNG."""
        logging.debug('init')
        self.x, self.y, self.x_test, self.y_test = None, None, None, None
        self.seed = 42
        # NOTE: this seeds the process-wide NumPy generator, which is the
        # one `prepare_data` draws from.
        np.random.seed(self.seed)

    def load_data(self, header=None, sep='\t', path='data/data.csv'):
        """Read a delimited file into ``self.x`` and ``self.y``.

        The last column of the file is used as the target; all other
        columns are features. A constant column of ones is appended to the
        features. Nothing is returned; use `get_data` to read the result.

        Args:
            header: Passed to ``pandas.read_csv`` as ``header``.
            sep: Passed to ``pandas.read_csv`` as ``sep``.
            path: Location of the file to read. The default is relative to
                the current working directory.
        """
        logging.info(self.load_data.__name__)
        data = pd.read_csv(path, header=header, sep=sep)

        # Take an explicit copy: adding a column to a selection of `data`
        # is the chained-assignment pattern pandas warns about.
        x = data[data.columns[:-1]].copy()

        # add constant for intercept
        x[len(data.columns)] = 1
        y = data[data.columns[-1]]

        self.x = np.asarray(x.values.tolist())
        self.y = np.asarray(y.values.tolist())

    @staticmethod
    def scale(x):
        """Return ``x`` centred on its mean and divided by its std.

        The division is not guarded, so an input whose standard deviation
        is zero is not handled specially.
        """
        return (x - x.mean()) / x.std()

    def get_data(self):
        """Return the ``(x, y)`` pair stored by `load_data`."""
        logging.info(self.get_data.__name__)
        return self.x, self.y

    @staticmethod
    def prepare_data(x, y, test_ratio=None):
        """Append an intercept column, shuffle, and split into train/test.

        Randomness comes from NumPy's global generator, so results are
        reproducible only if that generator has been seeded.

        Args:
            x: 2-D feature array with one row per sample.
            y: Target array indexed in step with the rows of ``x``.
            test_ratio: Fraction of rows to hold out for testing. When
                ``None`` the module-level ``TEST_SET_RATIO`` is used.

        Returns:
            Tuple ``(x_train, y_train, x_test, y_test)``.
        """
        if test_ratio is None:
            test_ratio = TEST_SET_RATIO

        # add constant for intercept
        x = np.c_[x, np.ones(x.shape[0])]

        # shuffle rows and targets with the same permutation
        rand_idxs = np.random.permutation(x.shape[0])
        x, y = x[rand_idxs, :], y[rand_idxs]

        # draw the test rows without replacement; everything else trains
        test_size = int(len(x) * test_ratio)
        test_idx = np.random.choice(x.shape[0], size=test_size, replace=False)
        train_idx = np.ones(x.shape[0], dtype=bool)
        train_idx[test_idx] = False
        x_test, y_test = x[test_idx, :], y[test_idx]
        x_train, y_train = x[train_idx, :], y[train_idx]

        return x_train, y_train, x_test, y_test

    def load_iris_data(self):
        """Classification set: load iris and return its train/test split."""
        logging.info(self.load_iris_data.__name__)
        iris_dataset = load_iris()
        x = iris_dataset.data
        y = iris_dataset.target

        return self.prepare_data(x, y)

    def load_diabetes_data(self):
        """Regression set: load diabetes and return its train/test split."""
        logging.info(self.load_diabetes_data.__name__)
        diabetes_dataset = load_diabetes()
        x = diabetes_dataset.data
        y = diabetes_dataset.target

        return self.prepare_data(x, y)