Skip to content

Loop reshuffling for warps

Jakob Progsch edited this page Oct 23, 2015 · 5 revisions

The loop optimization example from the STEPS final presentation:

#Code#

#include <stdlib.h>
#include <math.h>

const int N = 4*1024;
const int NI = 16;
const int NO = 16;

void loop0(float *a, const float *x) {
#pragma acc kernels present(x[0:N*NI]) present(a[0:N*NO])
    for(int i = 0;i<N;++i) {
        a[i*NO + 0] = logf(x[i*NI + 0] + x[i*NI + 1]);
        a[i*NO + 1] = logf(x[i*NI + 4]);
        a[i*NO + 2] = logf(x[i*NI + 3] + x[i*NI + 3]);
        a[i*NO + 3] = logf(x[i*NI + 6] + x[i*NI + 7]);
        a[i*NO + 4] = logf(x[i*NI + 8] + x[i*NI + 1]);
        a[i*NO + 5] = logf(x[i*NI + 3] + x[i*NI +10] + x[i*NI + 7]);
        a[i*NO + 6] = logf(x[i*NI + 2] + x[i*NI + 1]);
        a[i*NO + 7] = logf(x[i*NI + 9] + x[i*NI + 5]);
        a[i*NO + 8] = logf(x[i*NI +11] + x[i*NI + 1]);
        a[i*NO + 9] = logf(x[i*NI +15] + x[i*NI + 8]);
        a[i*NO +10] = logf(x[i*NI + 0]);
        a[i*NO +11] = logf(x[i*NI + 1] + x[i*NI + 3]);
        a[i*NO +12] = logf(x[i*NI +13] + x[i*NI + 2]);
        a[i*NO +13] = logf(x[i*NI +14] + x[i*NI + 9]);
        a[i*NO +14] = logf(x[i*NI +12] + x[i*NI +11]);
        a[i*NO +15] = logf(x[i*NI +10] + x[i*NI + 1]);
    }
}

void loop1(float *a, const float *x) {
#pragma acc kernels present(x[0:N*NI]) present(a[0:N*NO])
#pragma acc loop independent
    for(int i = 0;i<N;++i) {
        a[i*NO + 0] = logf(x[i*NI + 0] + x[i*NI + 1]);
        a[i*NO + 1] = logf(x[i*NI + 4]);
        a[i*NO + 2] = logf(x[i*NI + 3] + x[i*NI + 3]);
        a[i*NO + 3] = logf(x[i*NI + 6] + x[i*NI + 7]);
        a[i*NO + 4] = logf(x[i*NI + 8] + x[i*NI + 1]);
        a[i*NO + 5] = logf(x[i*NI + 3] + x[i*NI +10] + x[i*NI + 7]);
        a[i*NO + 6] = logf(x[i*NI + 2] + x[i*NI + 1]);
        a[i*NO + 7] = logf(x[i*NI + 9] + x[i*NI + 5]);
        a[i*NO + 8] = logf(x[i*NI +11] + x[i*NI + 1]);
        a[i*NO + 9] = logf(x[i*NI +15] + x[i*NI + 8]);
        a[i*NO +10] = logf(x[i*NI + 0]);
        a[i*NO +11] = logf(x[i*NI + 1] + x[i*NI + 3]);
        a[i*NO +12] = logf(x[i*NI +13] + x[i*NI + 2]);
        a[i*NO +13] = logf(x[i*NI +14] + x[i*NI + 9]);
        a[i*NO +14] = logf(x[i*NI +12] + x[i*NI +11]);
        a[i*NO +15] = logf(x[i*NI +10] + x[i*NI + 1]);
    }
}

void loop2(float *a, const float *restrict x) {
#pragma acc kernels present(x[0:N*NI]) present(a[0:N*NO])
#pragma acc loop independent
    for(int i = 0;i<N;++i) {
        a[i*NO + 0] = logf(x[i*NI + 0] + x[i*NI + 1]);
        a[i*NO + 1] = logf(x[i*NI + 4]);
        a[i*NO + 2] = logf(x[i*NI + 3] + x[i*NI + 3]);
        a[i*NO + 3] = logf(x[i*NI + 6] + x[i*NI + 7]);
        a[i*NO + 4] = logf(x[i*NI + 8] + x[i*NI + 1]);
        a[i*NO + 5] = logf(x[i*NI + 3] + x[i*NI +10] + x[i*NI + 7]);
        a[i*NO + 6] = logf(x[i*NI + 2] + x[i*NI + 1]);
        a[i*NO + 7] = logf(x[i*NI + 9] + x[i*NI + 5]);
        a[i*NO + 8] = logf(x[i*NI +11] + x[i*NI + 1]);
        a[i*NO + 9] = logf(x[i*NI +15] + x[i*NI + 8]);
        a[i*NO +10] = logf(x[i*NI + 0]);
        a[i*NO +11] = logf(x[i*NI + 1] + x[i*NI + 3]);
        a[i*NO +12] = logf(x[i*NI +13] + x[i*NI + 2]);
        a[i*NO +13] = logf(x[i*NI +14] + x[i*NI + 9]);
        a[i*NO +14] = logf(x[i*NI +12] + x[i*NI +11]);
        a[i*NO +15] = logf(x[i*NI +10] + x[i*NI + 1]);
    }
}

void loop3(float *a, const float *x) {
#pragma acc kernels present(x[0:N*NI]) present(a[0:N*NO])
#pragma acc loop independent
    for(int i = 0;i<N;++i) {
        a[i + 0*N] = logf(x[i + 0*N] + x[i + 1*N]);
        a[i + 1*N] = logf(x[i + 4*N]);
        a[i + 2*N] = logf(x[i + 3*N] + x[i + 3*N]);
        a[i + 3*N] = logf(x[i + 6*N] + x[i + 7*N]);
        a[i + 4*N] = logf(x[i + 8*N] + x[i + 1*N]);
        a[i + 5*N] = logf(x[i + 3*N] + x[i +10*N] + x[i + 7*N]);
        a[i + 6*N] = logf(x[i + 2*N] + x[i + 1*N]);
        a[i + 7*N] = logf(x[i + 9*N] + x[i + 5*N]);
        a[i + 8*N] = logf(x[i +11*N] + x[i + 1*N]);
        a[i + 9*N] = logf(x[i +15*N] + x[i + 8*N]);
        a[i +10*N] = logf(x[i + 0*N]);
        a[i +11*N] = logf(x[i + 1*N] + x[i + 3*N]);
        a[i +12*N] = logf(x[i +13*N] + x[i + 2*N]);
        a[i +13*N] = logf(x[i +14*N] + x[i + 9*N]);
        a[i +14*N] = logf(x[i +12*N] + x[i +11*N]);
        a[i +15*N] = logf(x[i +10*N] + x[i + 1*N]);
    }
}

void loop4(float *a, const float *restrict x) {
#pragma acc kernels present(x[0:N*NI]) present(a[0:N*NO])
#pragma acc loop independent
    for(int i = 0;i<N;++i) {
        a[i + 0*N] = logf(x[i + 0*N] + x[i + 1*N]);
        a[i + 1*N] = logf(x[i + 4*N]);
        a[i + 2*N] = logf(x[i + 3*N] + x[i + 3*N]);
        a[i + 3*N] = logf(x[i + 6*N] + x[i + 7*N]);
        a[i + 4*N] = logf(x[i + 8*N] + x[i + 1*N]);
        a[i + 5*N] = logf(x[i + 3*N] + x[i +10*N] + x[i + 7*N]);
        a[i + 6*N] = logf(x[i + 2*N] + x[i + 1*N]);
        a[i + 7*N] = logf(x[i + 9*N] + x[i + 5*N]);
        a[i + 8*N] = logf(x[i +11*N] + x[i + 1*N]);
        a[i + 9*N] = logf(x[i +15*N] + x[i + 8*N]);
        a[i +10*N] = logf(x[i + 0*N]);
        a[i +11*N] = logf(x[i + 1*N] + x[i + 3*N]);
        a[i +12*N] = logf(x[i +13*N] + x[i + 2*N]);
        a[i +13*N] = logf(x[i +14*N] + x[i + 9*N]);
        a[i +14*N] = logf(x[i +12*N] + x[i +11*N]);
        a[i +15*N] = logf(x[i +10*N] + x[i + 1*N]);
    }
}

void loop5(float *a, const float *restrict x) {
#pragma acc kernels present(x[0:N*NI]) present(a[0:N*NO])
#pragma acc loop independent collapse(3)
    for(int j = 0;j<N;j+=32) {
        for(int h = 0;h<2;++h) {
            for(int k = 0;k<32;++k) {
                int i = j+k;
                if(h == 0) {
                    a[i + 0*N] = logf(x[i + 0*N] + x[i + 1*N]);
                    a[i + 1*N] = logf(x[i + 4*N]);
                    a[i + 2*N] = logf(x[i + 3*N] + x[i + 3*N]);
                    a[i + 3*N] = logf(x[i + 6*N] + x[i + 7*N]);
                    a[i + 4*N] = logf(x[i + 8*N] + x[i + 1*N]);
                    a[i + 5*N] = logf(x[i + 3*N] + x[i +10*N] + x[i + 7*N]);
                    a[i + 6*N] = logf(x[i + 2*N] + x[i + 1*N]);
                    a[i + 7*N] = logf(x[i + 9*N] + x[i + 5*N]);
                } else {
                    a[i + 8*N] = logf(x[i +11*N] + x[i + 1*N]);
                    a[i + 9*N] = logf(x[i +15*N] + x[i + 8*N]);
                    a[i +10*N] = logf(x[i + 0*N]);
                    a[i +11*N] = logf(x[i + 1*N] + x[i + 3*N]);
                    a[i +12*N] = logf(x[i +13*N] + x[i + 2*N]);
                    a[i +13*N] = logf(x[i +14*N] + x[i + 9*N]);
                    a[i +14*N] = logf(x[i +12*N] + x[i +11*N]);
                    a[i +15*N] = logf(x[i +10*N] + x[i + 1*N]);
                }
            }
        }
    }
}

void loop6(float *a, const float *restrict x) {
#pragma acc kernels present(x[0:N*NI]) present(a[0:N*NO])
#pragma acc loop independent collapse(3)
    for(int j = 0;j<N;j+=32) {
        for(int h = 0;h<4;++h) {
            for(int k = 0;k<32;++k) {
                int i = j+k;
                if(h == 0) {
                    a[i + 0*N] = logf(x[i + 0*N] + x[i + 1*N]);
                    a[i + 1*N] = logf(x[i + 4*N]);
                    a[i + 2*N] = logf(x[i + 3*N] + x[i + 3*N]);
                    a[i + 3*N] = logf(x[i + 6*N] + x[i + 7*N]);
                } else if(h == 1){
                    a[i + 4*N] = logf(x[i + 8*N] + x[i + 1*N]);
                    a[i + 5*N] = logf(x[i + 3*N] + x[i +10*N] + x[i + 7*N]);
                    a[i + 6*N] = logf(x[i + 2*N] + x[i + 1*N]);
                    a[i + 7*N] = logf(x[i + 9*N] + x[i + 5*N]);
                } else if(h == 2){
                    a[i + 8*N] = logf(x[i +11*N] + x[i + 1*N]);
                    a[i + 9*N] = logf(x[i +15*N] + x[i + 8*N]);
                    a[i +10*N] = logf(x[i + 0*N]);
                    a[i +11*N] = logf(x[i + 1*N] + x[i + 3*N]);
                } else {
                    a[i +12*N] = logf(x[i +13*N] + x[i + 2*N]);
                    a[i +13*N] = logf(x[i +14*N] + x[i + 9*N]);
                    a[i +14*N] = logf(x[i +12*N] + x[i +11*N]);
                    a[i +15*N] = logf(x[i +10*N] + x[i + 1*N]);
                }
            }
        }
    }
}


int main() {
    float *x = (float*)malloc(sizeof(float)*N*NI);
    float *a = (float*)malloc(sizeof(float)*N*NO);

    for(int i = 0;i<N*NI;++i) {
        x[i] = rand();
    }

#pragma acc data copyin(x[0:N*NI]) copyout(a[0:N*NO])
    for(int i = 0;i<1;++i) {
        loop0(a, x);
        loop1(a, x);
        loop2(a, x);
        loop3(a, x);
        loop4(a, x);
        loop5(a, x);
        loop6(a, x);
    }
    free(a);
    free(x);

    return 0;
}

#Timings (Tesla K80)#

Version              Duration            Grid Size      Block Size      Regs
Base:                54.666ms              (1 1 1)         (1 1 1)        13
Loop Independent:    38.912us             (32 1 1)       (128 1 1)        17
restrict:            23.680us             (32 1 1)       (128 1 1)        25
SoA:                 16.928us             (32 1 1)       (128 1 1)        17
SoA + restrict:      13.376us             (32 1 1)       (128 1 1)        25
2x work splitting:   9.1520us             (64 1 1)       (128 1 1)        21
4x work splitting:   7.1360us            (128 1 1)       (128 1 1)        21

Clone this wiki locally