forked from facebookarchive/fbcuda
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathIntegerDivisionPerfTest.cu
More file actions
71 lines (52 loc) · 1.77 KB
/
IntegerDivisionPerfTest.cu
File metadata and controls
71 lines (52 loc) · 1.77 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
// Copyright 2004-present Facebook. All Rights Reserved.
#include "cuda/IntegerDivision.cuh"
#include <stdio.h>
using namespace std;
using namespace facebook::cuda;
// Launch geometry for the benchmark kernels.
// Macro bodies are parenthesized so that they expand as a single value in any
// expression (e.g. `x / kNumThreadsPerBlock` or `x % kNumThreadsPerBlock`).

// Number of thread blocks in each kernel launch.
#define kNumBlocks (1200)
// Threads per block: four groups of 32 threads.
#define kNumThreadsPerBlock (32 * 4)
// Number of elements each thread reads in its loop.
#define kNumElemPerThread (64)
// Returns the index of the first element handled by the calling thread.
//
// The offset is blockIdx.x times the per-block element count
// (kNumThreadsPerBlock * kNumElemPerThread) plus threadIdx.x.
__device__ int getStartOffset() {
  const int elemsPerBlock = kNumThreadsPerBlock * kNumElemPerThread;
  return blockIdx.x * elemsPerBlock + threadIdx.x;
}
// Benchmark kernel that divides with the built-in `/` and `%` operators.
//
// Each thread reads kNumElemPerThread values from `data`, beginning at
// getStartOffset() and stepping by kNumThreadsPerBlock. For every value it
// adds quotient + remainder to a running total, then writes that total back
// to `data` at its start offset.
//
// num:  passed by the host as the element count of `data`; the kernel does not
//       read it, so the launch configuration must keep all accesses in range.
// data: buffer that is read and then partially overwritten in place.
// div:  divisor applied to every value read.
template <typename T>
__global__ void divideFixed(int num, T* data, T div) {
  // The start offset does not change inside the loop; compute it once.
  const int start = getStartOffset();
  // Accumulate in T rather than int, matching divideMagicFixed: this avoids
  // implicit signed/unsigned conversions and truncation for wider T.
  T sum = 0;
  for (int i = 0; i < kNumElemPerThread; ++i) {
    T v = data[i * kNumThreadsPerBlock + start];
    sum += v / div + v % div;
  }
  data[start] = sum;
}
// Benchmark kernel that divides through FixedDivisor<T>::divMod.
//
// Each thread reads kNumElemPerThread values from `data`, beginning at
// getStartOffset() and stepping by kNumThreadsPerBlock. For every value it
// obtains quotient and remainder from `div.divMod`, adds both to a running
// total, and finally writes the total back to `data` at its start offset.
//
// num:  passed by the host as the element count of `data`; not read here.
// data: buffer that is read and then partially overwritten in place.
// div:  divisor object whose divMod(value, &quotient, &remainder) is called.
template <typename T>
__global__ void divideMagicFixed(int num, T* data, FixedDivisor<T> div) {
  const int base = getStartOffset();
  T total = 0;
  int idx = base;
  for (int n = 0; n < kNumElemPerThread; ++n, idx += kNumThreadsPerBlock) {
    T value = data[idx];
    T quotient, remainder;
    div.divMod(value, &quotient, &remainder);
    total += quotient + remainder;
  }
  data[base] = total;
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when `err` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t err, const char* what) {
  if (err == cudaSuccess) {
    return true;
  }
  fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
  return false;
}

// Runs the four division kernels back to back on one buffer:
// divideMagicFixed and divideFixed, each for int and unsigned int.
//
// Before every launch the buffer is refilled with cudaMemset, which writes the
// byte value 5 into every byte. Every CUDA runtime call and kernel launch is
// checked; after the first failure the remaining steps are skipped, the buffer
// is still freed, and the process exits with status 1.
//
// argc, argv: unused.
// Returns 0 on success, 1 if any CUDA call failed.
int main(int argc, char** argv) {
  int d = 55;
  int num = kNumBlocks * kNumThreadsPerBlock * kNumElemPerThread;
  size_t numBytes = num * sizeof(unsigned int);

  unsigned int* dev = NULL;
  if (!cudaOk(cudaMalloc(&dev, numBytes), "cudaMalloc")) {
    return 1;
  }

  bool ok = cudaOk(cudaMemset(dev, 5, numBytes), "cudaMemset");
  if (ok) {
    divideMagicFixed<int><<<kNumBlocks, kNumThreadsPerBlock>>>(
        num, (int*) dev, FixedDivisor<int>(d));
    // Kernel launches return no status; launch errors are read back here.
    ok = cudaOk(cudaGetLastError(), "divideMagicFixed<int> launch");
  }

  ok = ok && cudaOk(cudaMemset(dev, 5, numBytes), "cudaMemset");
  if (ok) {
    divideMagicFixed<unsigned int><<<kNumBlocks, kNumThreadsPerBlock>>>(
        num, dev, FixedDivisor<unsigned int>(d));
    ok = cudaOk(cudaGetLastError(), "divideMagicFixed<unsigned int> launch");
  }

  ok = ok && cudaOk(cudaMemset(dev, 5, numBytes), "cudaMemset");
  if (ok) {
    divideFixed<int><<<kNumBlocks, kNumThreadsPerBlock>>>(num, (int*) dev, d);
    ok = cudaOk(cudaGetLastError(), "divideFixed<int> launch");
  }

  ok = ok && cudaOk(cudaMemset(dev, 5, numBytes), "cudaMemset");
  if (ok) {
    divideFixed<unsigned int><<<kNumBlocks, kNumThreadsPerBlock>>>(num, dev, d);
    ok = cudaOk(cudaGetLastError(), "divideFixed<unsigned int> launch");
  }

  // Errors raised while the kernels execute surface at this synchronization.
  ok = ok && cudaOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize");

  // Always release the buffer, even when an earlier step failed.
  ok = cudaOk(cudaFree(dev), "cudaFree") && ok;

  return ok ? 0 : 1;
}