SelectSensor/cuda_kernals.py at master · Wings-Lab/SelectSensor · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
'''
CUDA kernals
'''

import math
from numba import cuda, float64


local_array_size = 100   # Global variables are treated as constants. Always initialize the largest size, i.e. the budget


@cuda.jit('float64(float64)', device=True)
def q_function(x):
    '''q_function(x) = 1 - norf_cdf(x). However, numba does not support scipy.stats.norm.
       So we have to work around this issue. The way to fix this is to use math.erf to substitude scipy.stats.norm
       https://stats.stackexchange.com/questions/187828/how-are-the-error-function-and-standard-normal-distribution-function-related
    '''
    return 1. - 0.5*(1. + math.erf(x/math.sqrt(2.)))


@cuda.jit('void(float64[:,:], int64[:], int64, int64, float64[:])', device=True)
def set_pj_pi(meanvec_array, subset_index, j, i, pj_pi):
    '''1D array minus. C = A - B
    Attributes:
        A, B, C (array-like)
    '''
    index = 0
    for k in subset_index:
        pj_pi[index] = meanvec_array[j, k] - meanvec_array[i, k]
        index += 1


@cuda.jit('float64(float64[:], float64[:,:], float64[:], int64)', device=True)
def matmul(A, B, C, size):
    '''Implement np.dot(np.dot(pj_pi, sub_cov_inv), pj_pi)
       1. C = np.dot(A, B)
       2. return np.dot(C, A)
    Attributes:
        A (1D array): pj_pi
        B (2D array): sub_cov_inv
        C (1D array): tmp array
        size (int64): the size of A, B, C need to be used in computation.
                      note that we allocated a large enough space for every case, in all but one case, some memory is wasted
    Return:
        (float64)
    '''
    for i in range(size):
        summation = 0.
        for k in range(size):
            summation += A[k] * B[k, i]      # C = np.dot(A, B)
        C[i] = summation
    summation = 0.
    for i in range(size):
        summation += C[i] * A[i]             # np.dot(C, A)
    return summation


@cuda.jit('void(float64[:,:], int64[:], float64[:,:], float64, float64[:,:])')
def o_t_approx_kernal(meanvec_array, subset_index, sub_cov_inv, priori, results):
    '''The kernal for o_t_approx. Each thread executes a kernal, which is responsible for one element in results array.
    Attributes:
        meanvec_array (np 2D array): contains the mean vector of every transmitter
        subset_index (np 1D array):  index of some sensors
        cov_inv (np 2D array):       inverse of a covariance matrix
        priori (float64):            the prior of each hypothesis
        results (np 2D array):       save the all the results
    '''
    i, j = cuda.grid(2)
    if i < results.shape[0] and j < results.shape[1] and i != j:  # warning: in Linux simulator, need to consider case i ==j
        pj_pi = cuda.local.array(local_array_size, dtype=float64)
        tmp = cuda.local.array(local_array_size, dtype=float64)
        set_pj_pi(meanvec_array, subset_index, j, i, pj_pi)
        results[i, j] = q_function(0.5 * math.sqrt(matmul(pj_pi, sub_cov_inv, tmp, subset_index.size))) * priori


@cuda.jit('float64(int64, int64, int64)', device=True)
def distance_of_hypothesis(i, j, hypothesis_num):
    '''Compute the euclidian distance between two hypothesis.
    Attributes:
        i (int): i_{th} hypothesis
        j (int): j_{th} hypothesis
        hypothesis_num (int): the total number of hypothesis, equals to grid_len^2
    '''
    grid_len = int(math.sqrt(float(hypothesis_num)))
    i_x = i//grid_len
    i_y = i%grid_len
    j_x = j//grid_len
    j_y = j%grid_len
    return math.sqrt(float((i_x - j_x)**2 + (i_y - j_y)**2))


@cuda.jit('void(float64[:,:], int64[:], float64[:,:], float64, float64[:,:])')
def o_t_approx_dist_kernal(meanvec_array, subset_index, sub_cov_inv, priori, results):
    '''The kernal for o_t_approx_dist with distance weight. Each thread executes a kernal, which is responsible for one element in results array.
    Attributes:
        meanvec_array (np 2D array): contains the mean vector of every transmitter
        subset_index (np 1D array):  index of some sensors
        cov_inv (np 2D array):       inverse of a covariance matrix
        priori (float64):            the prior of each hypothesis
        results (np 2D array):       save the all the results
    '''
    i, j = cuda.grid(2)
    if i < results.shape[0] and j < results.shape[1] and i != j:  # warning: in Linux simulator, need to consider case i == j
        pj_pi = cuda.local.array(local_array_size, dtype=float64)
        tmp = cuda.local.array(local_array_size, dtype=float64)
        set_pj_pi(meanvec_array, subset_index, j, i, pj_pi)
        distance = distance_of_hypothesis(i, j, results.shape[0])
        results[i, j] = distance * q_function(0.5 * math.sqrt(matmul(pj_pi, sub_cov_inv, tmp, subset_index.size))) * priori


@cuda.jit('void(float64[:,:], int64[:], float64[:,:], float64[:,:])')
def o_t_kernal(meanvec_array, subset_index, sub_cov_inv, results):
    '''The kernal for o_t. Each thread executes a kernal, which is responsible for one element in results array.
    Attributes:
        meanvec_array (np 2D array): contains the mean vector of every transmitter
        subset_index (np 1D array):  index of some sensors
        cov_inv (np 2D array):       inverse of a covariance matrix
        results (np 2D array):       save the all the results
    '''
    i, j = cuda.grid(2)
    if i < results.shape[0] and j < results.shape[1]:  # warning: in Linux simulator, need to consider case i ==j
        if i == j:
            results[i, j] = 1.
        else:
            pj_pi = cuda.local.array(local_array_size, dtype=float64)
            tmp = cuda.local.array(local_array_size, dtype=float64)
            set_pj_pi(meanvec_array, subset_index, j, i, pj_pi)
            results[i, j] = (1 - q_function(0.5 * math.sqrt(matmul(pj_pi, sub_cov_inv, tmp, subset_index.size))))