-
Notifications
You must be signed in to change notification settings - Fork 3
Loop reshuffling for warps
Jakob Progsch edited this page Oct 23, 2015
·
5 revisions
The loop optimization example from the STEPS final presentation:
#Code#
#include <stdlib.h>
#include <math.h>
const int N = 4*1024;
const int NI = 16;
const int NO = 16;
void loop0(float *a, const float *x) {
#pragma acc kernels present(x[0:N*NI]) present(a[0:N*NO])
for(int i = 0;i<N;++i) {
a[i*NO + 0] = logf(x[i*NI + 0] + x[i*NI + 1]);
a[i*NO + 1] = logf(x[i*NI + 4]);
a[i*NO + 2] = logf(x[i*NI + 3] + x[i*NI + 3]);
a[i*NO + 3] = logf(x[i*NI + 6] + x[i*NI + 7]);
a[i*NO + 4] = logf(x[i*NI + 8] + x[i*NI + 1]);
a[i*NO + 5] = logf(x[i*NI + 3] + x[i*NI +10] + x[i*NI + 7]);
a[i*NO + 6] = logf(x[i*NI + 2] + x[i*NI + 1]);
a[i*NO + 7] = logf(x[i*NI + 9] + x[i*NI + 5]);
a[i*NO + 8] = logf(x[i*NI +11] + x[i*NI + 1]);
a[i*NO + 9] = logf(x[i*NI +15] + x[i*NI + 8]);
a[i*NO +10] = logf(x[i*NI + 0]);
a[i*NO +11] = logf(x[i*NI + 1] + x[i*NI + 3]);
a[i*NO +12] = logf(x[i*NI +13] + x[i*NI + 2]);
a[i*NO +13] = logf(x[i*NI +14] + x[i*NI + 9]);
a[i*NO +14] = logf(x[i*NI +12] + x[i*NI +11]);
a[i*NO +15] = logf(x[i*NI +10] + x[i*NI + 1]);
}
}
void loop1(float *a, const float *x) {
#pragma acc kernels present(x[0:N*NI]) present(a[0:N*NO])
#pragma acc loop independent
for(int i = 0;i<N;++i) {
a[i*NO + 0] = logf(x[i*NI + 0] + x[i*NI + 1]);
a[i*NO + 1] = logf(x[i*NI + 4]);
a[i*NO + 2] = logf(x[i*NI + 3] + x[i*NI + 3]);
a[i*NO + 3] = logf(x[i*NI + 6] + x[i*NI + 7]);
a[i*NO + 4] = logf(x[i*NI + 8] + x[i*NI + 1]);
a[i*NO + 5] = logf(x[i*NI + 3] + x[i*NI +10] + x[i*NI + 7]);
a[i*NO + 6] = logf(x[i*NI + 2] + x[i*NI + 1]);
a[i*NO + 7] = logf(x[i*NI + 9] + x[i*NI + 5]);
a[i*NO + 8] = logf(x[i*NI +11] + x[i*NI + 1]);
a[i*NO + 9] = logf(x[i*NI +15] + x[i*NI + 8]);
a[i*NO +10] = logf(x[i*NI + 0]);
a[i*NO +11] = logf(x[i*NI + 1] + x[i*NI + 3]);
a[i*NO +12] = logf(x[i*NI +13] + x[i*NI + 2]);
a[i*NO +13] = logf(x[i*NI +14] + x[i*NI + 9]);
a[i*NO +14] = logf(x[i*NI +12] + x[i*NI +11]);
a[i*NO +15] = logf(x[i*NI +10] + x[i*NI + 1]);
}
}
void loop2(float *a, const float *restrict x) {
#pragma acc kernels present(x[0:N*NI]) present(a[0:N*NO])
#pragma acc loop independent
for(int i = 0;i<N;++i) {
a[i*NO + 0] = logf(x[i*NI + 0] + x[i*NI + 1]);
a[i*NO + 1] = logf(x[i*NI + 4]);
a[i*NO + 2] = logf(x[i*NI + 3] + x[i*NI + 3]);
a[i*NO + 3] = logf(x[i*NI + 6] + x[i*NI + 7]);
a[i*NO + 4] = logf(x[i*NI + 8] + x[i*NI + 1]);
a[i*NO + 5] = logf(x[i*NI + 3] + x[i*NI +10] + x[i*NI + 7]);
a[i*NO + 6] = logf(x[i*NI + 2] + x[i*NI + 1]);
a[i*NO + 7] = logf(x[i*NI + 9] + x[i*NI + 5]);
a[i*NO + 8] = logf(x[i*NI +11] + x[i*NI + 1]);
a[i*NO + 9] = logf(x[i*NI +15] + x[i*NI + 8]);
a[i*NO +10] = logf(x[i*NI + 0]);
a[i*NO +11] = logf(x[i*NI + 1] + x[i*NI + 3]);
a[i*NO +12] = logf(x[i*NI +13] + x[i*NI + 2]);
a[i*NO +13] = logf(x[i*NI +14] + x[i*NI + 9]);
a[i*NO +14] = logf(x[i*NI +12] + x[i*NI +11]);
a[i*NO +15] = logf(x[i*NI +10] + x[i*NI + 1]);
}
}
void loop3(float *a, const float *x) {
#pragma acc kernels present(x[0:N*NI]) present(a[0:N*NO])
#pragma acc loop independent
for(int i = 0;i<N;++i) {
a[i + 0*N] = logf(x[i + 0*N] + x[i + 1*N]);
a[i + 1*N] = logf(x[i + 4*N]);
a[i + 2*N] = logf(x[i + 3*N] + x[i + 3*N]);
a[i + 3*N] = logf(x[i + 6*N] + x[i + 7*N]);
a[i + 4*N] = logf(x[i + 8*N] + x[i + 1*N]);
a[i + 5*N] = logf(x[i + 3*N] + x[i +10*N] + x[i + 7*N]);
a[i + 6*N] = logf(x[i + 2*N] + x[i + 1*N]);
a[i + 7*N] = logf(x[i + 9*N] + x[i + 5*N]);
a[i + 8*N] = logf(x[i +11*N] + x[i + 1*N]);
a[i + 9*N] = logf(x[i +15*N] + x[i + 8*N]);
a[i +10*N] = logf(x[i + 0*N]);
a[i +11*N] = logf(x[i + 1*N] + x[i + 3*N]);
a[i +12*N] = logf(x[i +13*N] + x[i + 2*N]);
a[i +13*N] = logf(x[i +14*N] + x[i + 9*N]);
a[i +14*N] = logf(x[i +12*N] + x[i +11*N]);
a[i +15*N] = logf(x[i +10*N] + x[i + 1*N]);
}
}
void loop4(float *a, const float *restrict x) {
#pragma acc kernels present(x[0:N*NI]) present(a[0:N*NO])
#pragma acc loop independent
for(int i = 0;i<N;++i) {
a[i + 0*N] = logf(x[i + 0*N] + x[i + 1*N]);
a[i + 1*N] = logf(x[i + 4*N]);
a[i + 2*N] = logf(x[i + 3*N] + x[i + 3*N]);
a[i + 3*N] = logf(x[i + 6*N] + x[i + 7*N]);
a[i + 4*N] = logf(x[i + 8*N] + x[i + 1*N]);
a[i + 5*N] = logf(x[i + 3*N] + x[i +10*N] + x[i + 7*N]);
a[i + 6*N] = logf(x[i + 2*N] + x[i + 1*N]);
a[i + 7*N] = logf(x[i + 9*N] + x[i + 5*N]);
a[i + 8*N] = logf(x[i +11*N] + x[i + 1*N]);
a[i + 9*N] = logf(x[i +15*N] + x[i + 8*N]);
a[i +10*N] = logf(x[i + 0*N]);
a[i +11*N] = logf(x[i + 1*N] + x[i + 3*N]);
a[i +12*N] = logf(x[i +13*N] + x[i + 2*N]);
a[i +13*N] = logf(x[i +14*N] + x[i + 9*N]);
a[i +14*N] = logf(x[i +12*N] + x[i +11*N]);
a[i +15*N] = logf(x[i +10*N] + x[i + 1*N]);
}
}
void loop5(float *a, const float *restrict x) {
#pragma acc kernels present(x[0:N*NI]) present(a[0:N*NO])
#pragma acc loop independent collapse(3)
for(int j = 0;j<N;j+=32) {
for(int h = 0;h<2;++h) {
for(int k = 0;k<32;++k) {
int i = j+k;
if(h == 0) {
a[i + 0*N] = logf(x[i + 0*N] + x[i + 1*N]);
a[i + 1*N] = logf(x[i + 4*N]);
a[i + 2*N] = logf(x[i + 3*N] + x[i + 3*N]);
a[i + 3*N] = logf(x[i + 6*N] + x[i + 7*N]);
a[i + 4*N] = logf(x[i + 8*N] + x[i + 1*N]);
a[i + 5*N] = logf(x[i + 3*N] + x[i +10*N] + x[i + 7*N]);
a[i + 6*N] = logf(x[i + 2*N] + x[i + 1*N]);
a[i + 7*N] = logf(x[i + 9*N] + x[i + 5*N]);
} else {
a[i + 8*N] = logf(x[i +11*N] + x[i + 1*N]);
a[i + 9*N] = logf(x[i +15*N] + x[i + 8*N]);
a[i +10*N] = logf(x[i + 0*N]);
a[i +11*N] = logf(x[i + 1*N] + x[i + 3*N]);
a[i +12*N] = logf(x[i +13*N] + x[i + 2*N]);
a[i +13*N] = logf(x[i +14*N] + x[i + 9*N]);
a[i +14*N] = logf(x[i +12*N] + x[i +11*N]);
a[i +15*N] = logf(x[i +10*N] + x[i + 1*N]);
}
}
}
}
}
void loop6(float *a, const float *restrict x) {
#pragma acc kernels present(x[0:N*NI]) present(a[0:N*NO])
#pragma acc loop independent collapse(3)
for(int j = 0;j<N;j+=32) {
for(int h = 0;h<4;++h) {
for(int k = 0;k<32;++k) {
int i = j+k;
if(h == 0) {
a[i + 0*N] = logf(x[i + 0*N] + x[i + 1*N]);
a[i + 1*N] = logf(x[i + 4*N]);
a[i + 2*N] = logf(x[i + 3*N] + x[i + 3*N]);
a[i + 3*N] = logf(x[i + 6*N] + x[i + 7*N]);
} else if(h == 1){
a[i + 4*N] = logf(x[i + 8*N] + x[i + 1*N]);
a[i + 5*N] = logf(x[i + 3*N] + x[i +10*N] + x[i + 7*N]);
a[i + 6*N] = logf(x[i + 2*N] + x[i + 1*N]);
a[i + 7*N] = logf(x[i + 9*N] + x[i + 5*N]);
} else if(h == 2){
a[i + 8*N] = logf(x[i +11*N] + x[i + 1*N]);
a[i + 9*N] = logf(x[i +15*N] + x[i + 8*N]);
a[i +10*N] = logf(x[i + 0*N]);
a[i +11*N] = logf(x[i + 1*N] + x[i + 3*N]);
} else {
a[i +12*N] = logf(x[i +13*N] + x[i + 2*N]);
a[i +13*N] = logf(x[i +14*N] + x[i + 9*N]);
a[i +14*N] = logf(x[i +12*N] + x[i +11*N]);
a[i +15*N] = logf(x[i +10*N] + x[i + 1*N]);
}
}
}
}
}
int main() {
float *x = (float*)malloc(sizeof(float)*N*NI);
float *a = (float*)malloc(sizeof(float)*N*NO);
for(int i = 0;i<N*NI;++i) {
x[i] = rand();
}
#pragma acc data copyin(x[0:N*NI]) copyout(a[0:N*NO])
for(int i = 0;i<1;++i) {
loop0(a, x);
loop1(a, x);
loop2(a, x);
loop3(a, x);
loop4(a, x);
loop5(a, x);
loop6(a, x);
}
free(a);
free(x);
return 0;
}
#Timings (Tesla K80)#
Version Duration Grid Size Block Size Regs
Base: 54.666ms (1 1 1) (1 1 1) 13
Loop Independent: 38.912us (32 1 1) (128 1 1) 17
restrict: 23.680us (32 1 1) (128 1 1) 25
SoA: 16.928us (32 1 1) (128 1 1) 17
SoA + restrict: 13.376us (32 1 1) (128 1 1) 25
2x work splitting: 9.1520us (64 1 1) (128 1 1) 21
4x work splitting: 7.1360us (128 1 1) (128 1 1) 21