one addition and it gets 25 times slower performance issues

Hi,

this is the original code:

// Recursive (IIR) Gaussian filter: one thread filters one image column.
// A causal forward pass (top to bottom) and an anticausal reverse pass
// (bottom to top) are run, and their outputs are summed into od so the
// overall impulse response is symmetric.
//   id, od         : input / output image, row-major, w columns x h rows
//   a0..a3, b1, b2 : second-order IIR recursion coefficients
//   coefp, coefn   : boundary coefficients used to seed the recursions when
//                    CLAMP_TO_EDGE is defined (edge-clamped initial state)
// NOTE(review): there is no bounds check on x -- this assumes the launch
// configuration provides exactly one in-range thread per column (h <= 640 is
// also implied by the scratch array size); confirm at the call site.
__global__ void

d_recursiveGaussian_float_ver4(float *id, float *od, int w, int h, float a0, float a1, float a2, float a3, float b1, float b2, float coefp, float coefn)

{

    unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;   // column index for this thread

    // Per-thread scratch array: 640 floats per thread is far too large for
    // registers, so this is placed in (off-chip) local memory.
    float s[640];

    float * k = s;

    id += x;    // advance pointers to correct column

    od += x;

   // forward pass

    float xp = 0.0f;  // previous input

    float yp = 0.0f;  // previous output

    float yb = 0.0f;  // previous output by 2

#if CLAMP_TO_EDGE

    // Seed the recursion as if the first sample extended infinitely upward.
    xp = *id; yb = coefp*xp; yp = yb;

#endif

   #pragma  unroll 160

    for (int y = 0; y < h; y++) {

        // k is never advanced in this version, so every iteration stores to
        // s[0]; the compiler can treat this as a dead/hoistable store and
        // keep the whole loop in registers.
        float xc = *k = *id;

        float yc = a0*xc + a1*xp - b1*yp - b2*yb;   // causal 2nd-order IIR step

        *od = yc;

        id += w; od += w;    // move to next row

        xp = xc; yb = yp; yp = yc;   // shift recursion state (order matters)

    }

   // reset pointers to point to last element in column

    id -= w;

    od -= w;

   // reverse pass

    // ensures response is symmetrical

    float xn = 0.0f;

    float xa = 0.0f;

    float yn = 0.0f;

    float ya = 0.0f;

#if CLAMP_TO_EDGE

    // Seed the reverse recursion from the bottom edge sample.
    xn = xa = *id; yn = coefn*xn; ya = yn;

#endif

    #pragma  unroll 160

    for (int y = h-1; y >= 0; y--) {

        float xc = *id;

        float yc = a2*xn + a3*xa - b1*yn - b2*ya;   // anticausal 2nd-order IIR step

        xa = xn; xn = xc; ya = yn; yn = yc;   // shift recursion state (order matters)

        *od = *od + yc;   // accumulate onto the forward-pass result

        id -= w; od -= w;  // move to previous row

    }

}

it runs within 2.0 ms.

so,

now i do a small change: i add a simple increment of a pointer (k += 1; in the first for loop)

// Modified version of the recursive (IIR) Gaussian filter kernel above:
// identical except that the scratch pointer k is now incremented each
// forward-pass iteration, so each row's input sample is stored to a distinct
// element of the per-thread local array s.
//   id, od         : input / output image, row-major, w columns x h rows
//   a0..a3, b1, b2 : second-order IIR recursion coefficients
//   coefp, coefn   : boundary coefficients used when CLAMP_TO_EDGE is defined
// NOTE(review): no bounds check on x; assumes one in-range thread per column
// and h <= 640 (size of s) -- confirm at the call site.
__global__ void

d_recursiveGaussian_float_ver4(float *id, float *od, int w, int h, float a0, float a1, float a2, float a3, float b1, float b2, float coefp, float coefn)

{

    unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;   // column index for this thread

    // Per-thread scratch array: 640 floats per thread is far too large for
    // registers, so this is placed in (off-chip) local memory.
    float s[640];

    float * k = s;

    id += x;    // advance pointers to correct column

    od += x;

   // forward pass

    float xp = 0.0f;  // previous input

    float yp = 0.0f;  // previous output

    float yb = 0.0f;  // previous output by 2

#if CLAMP_TO_EDGE

    // Seed the recursion as if the first sample extended infinitely upward.
    xp = *id; yb = coefp*xp; yp = yb;

#endif

   #pragma  unroll 160

    for (int y = 0; y < h; y++) {

        // Because k now advances (below), this store targets a different
        // element of s every iteration and can no longer be eliminated or
        // hoisted -- it becomes a real local-memory store per row.
        float xc = *k = *id;

        float yc = a0*xc + a1*xp - b1*yp - b2*yb;   // causal 2nd-order IIR step

        *od = yc;

        id += w; od += w;    // move to next row

        xp = xc; yb = yp; yp = yc; k+=1; // SMALL CHANGE: advance scratch pointer; forces per-iteration local-memory traffic, the likely cause of the ~25x slowdown

    }

   // reset pointers to point to last element in column

    id -= w;

    od -= w;

   // reverse pass

    // ensures response is symmetrical

    float xn = 0.0f;

    float xa = 0.0f;

    float yn = 0.0f;

    float ya = 0.0f;

#if CLAMP_TO_EDGE

    // Seed the reverse recursion from the bottom edge sample.
    xn = xa = *id; yn = coefn*xn; ya = yn;

#endif

    #pragma  unroll 160

    for (int y = h-1; y >= 0; y--) {

        float xc = *id;

        float yc = a2*xn + a3*xa - b1*yn - b2*ya;   // anticausal 2nd-order IIR step

        xa = xn; xn = xc; ya = yn; yn = yc;   // shift recursion state (order matters)

        *od = *od + yc;   // accumulate onto the forward-pass result

        id -= w; od -= w;  // move to previous row

    }

}

this whole thing gets 25 times slower :blink:

Can someone explain me, why does it happen.

Use the profiler, but I assume you now get an uncoalesced write in each loop iteration (no, I do not believe the documentation's claim that local memory writes are coalesced; that usually holds for single variables, but I don't think it is true for arrays), whereas before the compiler moved the write outside the loop.

oh :o

i've got another interesting detail:

when i run both of the functions, they both get 25 times slower!

Are you not calling cudaThreadSynchronize() before each wall-clock timing measurement?

i do call.

and it works fine again, when i remove k+=1 statement.