# -*- coding: utf-8 -*-
"""
An implementation of the approximate marginal GP.
Garnett, R., Osborne, M., and Hennig, P. Active Learning of Linear
Embeddings for Gaussian Processes. (2014). 30th Conference on
Uncertainty in Artificial Intelligence (UAI 2014).
Suppose we have a Gaussian process model on a latent function f:
p(f | \theta) = GP(f; \mu(x; \theta), K(x, x'; \theta)),
where \theta are the hyperparameters of the model. Suppose we have a
dataset D = (X, y) of observations and a test point x*. This
function returns the mean and variance of the approximate marginal
predictive distributions for the associated observation value y* and
latent function value f*:
p(y* | x*, D) = \int p(y* | x*, D, \theta) p(\theta | D) d\theta,
p(f* | x*, D) = \int p(f* | x*, D, \theta) p(\theta | D) d\theta,
where we have marginalized over the hyperparameters \theta. The
approximate posterior is derived using he "MGP" approximation
described in the paper above.
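
The MGP approximation (see the paper for the derivation) keeps the
predictive mean at the MAP hyperparameters \hat{\theta} and inflates
the predictive variance. Writing \Sigma for the inverse of the
negative Hessian of the log hyperparameter posterior at \hat{\theta},

    \tilde{m}(x*) = \hat{m}(x*; \hat{\theta}),
    \tilde{V}(x*) = (4/3) \hat{V}
                    + d\hat{m}' \Sigma d\hat{m}
                    + d\hat{V}' \Sigma d\hat{V} / (3 \hat{V}),

where the derivatives are taken w.r.t. \theta at \hat{\theta}. This is
the quantity assembled numerically in MGP.predict below.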
Notes
-----
This code is only appropriate for GP regression! Exact inference
with a Gaussian observation likelihood is assumed.

This MGP implementation uses numerical gradients in order to allow
any choice of kernel(s).

Dependencies
------------
- numpy
- GPy - https://github.com/SheffieldML/GPy
- numdifftools - https://github.com/pbrod/numdifftools

Usage
-----
Prediction with the MGP class works similarly to
GPy.models.GPRegression:

>>> gp = GPy.models.GPRegression(X, Y, kern=kernel)
>>> # Provide the location of the likelihood hps in model.param_array
>>> # for now
>>> mgp = MGP(gp, lik_idx=np.array([-1]))
>>> mu_star, var_star = mgp.predict(x_star)

Inputs
------
gp - an instance of GPy.models.GPRegression. Should work with most
    kernels.
lik_idx - np.ndarray containing the indexes of model.param_array that
    correspond to the likelihood hps.

Ahsan Alvi 2015
asa<at>robots.ox.ac.uk
"""
import sys

import GPy
import numdifftools as nd
import numpy as np


class MGP(object):
def __init__(self, gp, lik_idx=None, verbose=None):
"""
Defines all local variables of the MGP. Nomenclature follows
the paper.
:param gp: GPy.GP object whose hps are to be marginalised
:param lik_idx: ndarray. Locations of the likelihood params
in the gp.param_array variable.
"""
if verbose is None:
self.verbose = False
else:
self.verbose = verbose
self.gp = gp.copy()
self.theta_backup = self.gp[:].copy()
self.precision_matrix = None
self.ind_flexible_params, self.ind_params_fixed = \
find_free_hps(self.gp)
self.logexp_params = None
self._find_logexp_params()
self._update_H()
# Indexes of likelihood parameters. This is used to correct
# the derivative of the posterior variance in
# order to match the implementation based on GPML
if lik_idx is None:
if self.verbose:
                print('Guessing index of the lik params in m.param_array')
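            # For GPy's GPRegression the Gaussian noise variance sits
            # at the end of param_array, hence -1 as the default guess.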
self.lik_idx = np.array([-1])
# self._find_likelihood_params()
else:
self.lik_idx = lik_idx
def _update_H(self, verbose=None):
"""
Updates the posterior of the hyperparameters' Hessian
"""
if verbose is not None:
self.verbose = verbose
        if self.verbose:
            print("Computing Hessian of hyperparameter posterior...",
                  end=" ")
            sys.stdout.flush()
# The precision_matrix is the negative of the Hessian of the
# log likelihood. Garnett's Hessian in GPML is evaluated on the
# *negative* log likelihood, so it isn't negated in his code.
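        # Laplace-style treatment: the hyperparameter posterior is
        # approximated as N(theta_hat, Sigma) with
        # Sigma = precision_matrix^{-1}, where precision_matrix is the
        # negative Hessian of the log posterior at theta_hat.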
h = self._nd_gp_hessian()
self.precision_matrix = -h
        if self.verbose:
            print("Done.")
            print("Hessian:\n", h)
def predict(self, x_star, simple=None, delta_frac=None,
debug=None):
"""
Computes the quantities mhat, Vhat, their derivatives
w.r.t. theta and then returns the
MGP prediction at the locations x_star
:param x_star: array of input locations to predict
:param simple: boolean. If true, then the derivatives of
the mean and variance are found via a
simple central difference method instead
of using numdifftools. Default: True.
:param delta_frac: float. Size of the gap between the
hyperparameters as a fraction of their
current values.
Optional and only used when simple=True.
Default = 0.01
:param debug: bool. If true, then an extra dict is added
to the outputs containing intermediate
calculations
        :return: mean and variance at x_star as a tuple (plus a dict
                 of intermediate calculations when debug=True)
"""
if simple is None:
simple = True
if delta_frac is None:
delta_frac = 0.01
if debug is None:
debug = False
extras = {} # extra outputs for debugging purposes
# The MGP has the same mean as the parameterised GP
param_gp_pred = self.gp.predict(x_star)
mtilde = param_gp_pred[0].reshape((len(x_star), 1))
Vhat = param_gp_pred[1].reshape((len(x_star), 1))
# remove the likelihood variance, so that the prediction of
# the *latent* function is inflated, not the observations.
        # This follows the MATLAB implementation mgp.m line 241
Vhat -= self.gp.likelihood.param_array
Vtilde = np.zeros((len(x_star), 1))
        if self.verbose > 1:  # very verbose
            print("param_gp_pred", param_gp_pred)
            print("mtilde", mtilde)
            print("Vhat", Vhat)
dmhat_dtheta = np.zeros((len(x_star),
len(self.ind_flexible_params)))
dVhat_dtheta = np.zeros((len(x_star),
len(self.ind_flexible_params)))
if simple:
# less accurate derivatives
            if self.verbose:
                print("Running fast prediction...", end=" ")
# 0 ------ 1 ------ 2
# th-dth th th+dth
delta_theta = (self.theta_backup[self.ind_flexible_params] *
(delta_frac))
            if self.verbose:
                print("theta:", self.theta_backup[self.ind_flexible_params])
                print("delta_theta:", delta_theta)
# Build the different thetas where each one has one
# parameter varied by delta_theta.
# This will be stored in a nested list, where the first idx
            # corresponds to the theta we are varying and the second idx
# will select the theta vector with either th-dth or th+dth
# Only find differences for theta that are not fixed.
new_thetas = []
            for ii in range(len(self.ind_flexible_params)):
idx = self.ind_flexible_params[ii]
th_minus = self.theta_backup.copy()
th_minus[idx] = th_minus[idx] - delta_theta[ii]
th_plus = self.theta_backup.copy()
th_plus[idx] = th_plus[idx] + delta_theta[ii]
new_thetas.append([th_minus, th_plus])
            if self.verbose > 1:  # very verbose
                print("new_thetas:", new_thetas)
# Same structure as new_thetas above
mean_preds = []
var_preds = []
# Update the gp's hyperparameters and store the pred
            for ii in range(len(new_thetas)):
# minus
th_minus = new_thetas[ii][0]
self.gp[:] = th_minus
gp_pred = self.gp.predict(x_star)
mean_minus, var_minus = (gp_pred[0][:, 0], gp_pred[1][:, 0])
# plus
th_plus = new_thetas[ii][1]
self.gp[:] = th_plus
gp_pred = self.gp.predict(x_star)
mean_plus, var_plus = (gp_pred[0][:, 0], gp_pred[1][:, 0])
# Store the results
mean_preds.append([mean_minus, mean_plus])
var_preds.append([var_minus, var_plus])
self._reset_local_gp()
            if self.verbose > 1:  # very verbose
                print("mean_preds", mean_preds)
                print("var_preds", var_preds)
            # Once the mean and variance have been predicted at the
            # different values of theta, find the derivatives.
            # Each row corresponds to an x_star, each column to a theta.
            for ii in range(len(self.ind_flexible_params)):
curr_dmhat_dtheta = ((mean_preds[ii][1]
- mean_preds[ii][0])
/ (2*delta_theta[ii]))
# print "mean_preds[ii][1]", mean_preds[ii][1].shape
# print "mean_preds[ii][0]", mean_preds[ii][0].shape
# print "curr_dmhat_dtheta", curr_dmhat_dtheta.shape
dmhat_dtheta[:, ii] = curr_dmhat_dtheta
curr_dVhat_dtheta = ((var_preds[ii][1]
- var_preds[ii][0])
/ (2*delta_theta[ii]))
dVhat_dtheta[:, ii] = curr_dVhat_dtheta
# Correcting the derivatives w.r.t. likelihood terms
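            # (The finite differences above were taken on the full
            # predictive variance, whose derivative w.r.t. the noise
            # variance carries a constant +1 contribution; Vhat here is
            # the latent variance, so that contribution is removed.)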
for idx in self.lik_idx:
dVhat_dtheta[:, idx] -= 1
            if self.verbose > 1:  # very verbose
                print("dmhat_dtheta", dmhat_dtheta)
                print("dVhat_dtheta", dVhat_dtheta)
Vtilde = ((4. / 3.) * Vhat
+ np.diag(dmhat_dtheta.dot(
np.linalg.solve(self.precision_matrix,
dmhat_dtheta.T)))[:, None]
+ np.diag(dVhat_dtheta.dot(
np.linalg.solve(self.precision_matrix,
dVhat_dtheta.T)))[:, None]
/ (3. * Vhat))
            if self.verbose:
                print("Vtilde", Vtilde)
                print("done.")
        else:
            # More accurate gradients
            raise NotImplementedError
# Add on the likelihood variance so that this prediction
# is for the observations again.
# TODO: how will this work in the multi-output case?
Vtilde += self.gp.likelihood.param_array
if debug:
extras['delta_theta'] = delta_theta
extras['new_thetas'] = new_thetas
extras['theta_backup'] = self.theta_backup[
self.ind_flexible_params]
extras['mean_preds'] = mean_preds
extras['var_preds'] = var_preds
extras['Vhat'] = Vhat
extras['dmhat_dtheta'] = dmhat_dtheta
extras['dVhat_dtheta'] = dVhat_dtheta
extras['precision_matrix'] = self.precision_matrix
return mtilde, Vtilde, extras
else:
return mtilde, Vtilde
def _reset_local_gp(self):
"""
Sets the hyperparameters of self.gp to its original values.
This is needed so that all the numerical derivatives are the same
and don't rely on the previous evaluation
"""
self.gp[:] = self.theta_backup.copy()
def _nd_gp_hessian(self, step_max=None):
"""
Numerically evaluated the Hessian matrix for all parameters
of the GP that are not fixed.
:return: 2D numpy array size len(theta)^2 corresponding to
the Hessian
"""
if step_max is None:
step_max = 0.1
# Setting the GP params to the initial values, as they are
# changed in-place for the different numerical computations
self._reset_local_gp()
# Find all the free hyperparameters
# ind_flexible_params, ind_params_fixed = find_free_hps(self.gp)
# TODO: convert theta to logexp where relevant
theta = self.gp.param_array[self.ind_flexible_params]
def ll_update(m, params, idx_theta):
"""
Function that updates the GP's params and then returns
the new log likelihood
"""
# TODO: Deal with mixed logexp and untransformed hps
temp = m[:]
temp[idx_theta] = params
m[:] = temp
return m.log_likelihood()
def ll_func(t):
"""
Used by numdifftools
"""
return ll_update(self.gp, t, self.ind_flexible_params)
# TODO: if all hps are logexp, don't set step_nom or step_max
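        # Note: step_nom/step_max are keyword arguments of the older
        # numdifftools API that this module was written against; newer
        # numdifftools releases replaced them with step generators.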
H_func = nd.Hessian(ll_func, step_nom=theta, step_max=step_max)
H = H_func(theta)
# Setting the parameters to their original values again
self._reset_local_gp()
return H
def _find_likelihood_params(self):
"""
Find indexes of likelihood parameters and store them in
self.lik_idx. This is used to correct the derivative of
the posterior variance.
"""
errtxt = 'Automatically finding likelihood params not implemented.' +\
' Provide them for now.'
raise NotImplementedError(errtxt)
def _find_logexp_params(self):
"""
Finds all the parameters that are constrained to by positive
and updates self.logexp_params with their indexes.
The hessian and derivatives of these parameters will be evaluated
in log space and then transformed back into Cartesian space
"""
# The indexes related to logexp parameters are in the tuple
# with first element equal to sig:
sig = GPy.core.parameterization.transformations.Logexp._instance
for con_tuple in self.gp.constraints.items():
if con_tuple[0] == sig:
# con_tuple = (Logexp, array([indexes]))
self.logexp_params = con_tuple[1]
    def transform_theta(self, theta):
        """
        Transforms an input vector (assumed to be theta).
        Hyperparameters that are constrained to be positive are
        transformed into log space.

        returns: transformed theta vector
        """
        # No positively-constrained hps
        if self.logexp_params is None or len(self.logexp_params) == 0:
            return theta
        # All hps are positive
        if len(self.logexp_params) == len(self.theta_backup):
            return np.log(theta)
        # Mixed case: only transform the positively-constrained hps.
        # Work on a copy so the caller's array is not mutated.
        theta = theta.copy()
        theta[self.logexp_params] = np.log(theta[self.logexp_params])
        return theta
    def inverse_transform_theta(self, theta):
        """
        Inverts the Logexp transform from the given theta vector.
        Hyperparameters that are constrained to be positive are
        transformed back.

        returns: untransformed theta vector
        """
        # No positively-constrained hps
        if self.logexp_params is None or len(self.logexp_params) == 0:
            return theta
        # All hps are positive
        if len(self.logexp_params) == len(self.theta_backup):
            return np.exp(theta)
        # Mixed case: only transform the positively-constrained hps.
        # Work on a copy so the caller's array is not mutated.
        theta = theta.copy()
        theta[self.logexp_params] = np.exp(theta[self.logexp_params])
        return theta


def find_free_hps(gp):
    """
    Returns the locations of the hyperparameters that are not fixed,
    as indexes into the GP's param_array vector.

    :param gp: GPy.GP object
    :return: tuple (ind_flexible_params, ind_params_fixed) of index
             arrays for the free and the fixed hyperparameters
    """
# Find all the free hyperparameters
ind_params_fixed = gp.constraints['fixed']
temp_idx = np.arange(len(gp[:]))
ind_flexible_params = np.delete(temp_idx, ind_params_fixed)
return ind_flexible_params, ind_params_fixed


if __name__ == "__main__":
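    # A minimal usage sketch rather than a test suite. Assumptions: GPy
    # is installed, and the synthetic 1-D dataset, RBF kernel and seed
    # below are illustrative placeholders, not part of the original
    # module.
    np.random.seed(0)
    X = np.random.uniform(-3., 3., (20, 1))
    Y = np.sin(X) + 0.05 * np.random.randn(20, 1)
    gp = GPy.models.GPRegression(X, Y, GPy.kern.RBF(input_dim=1))
    gp.optimize()  # type-II ML / MAP hyperparameters
    mgp = MGP(gp, lik_idx=np.array([-1]))
    x_star = np.linspace(-3., 3., 50)[:, None]
    mu_star, var_star = mgp.predict(x_star)
    print("MGP prediction at x_star[0]:", mu_star[0], var_star[0])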