Hi,
this is the original code:
__global__ void
d_recursiveGaussian_float_ver4(float *id, float *od, int w, int h, float a0, float a1, float a2, float a3, float b1, float b2, float coefp, float coefn)
{
unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
float s[640];
float * k = s;
id += x; // advance pointers to correct column
od += x;
// forward pass
float xp = 0.0f; // previous input
float yp = 0.0f; // previous output
float yb = 0.0f; // previous output by 2
#if CLAMP_TO_EDGE
xp = *id; yb = coefp*xp; yp = yb;
#endif
#pragma unroll 160
for (int y = 0; y < h; y++) {
float xc = *k = *id;
float yc = a0*xc + a1*xp - b1*yp - b2*yb;
*od = yc;
id += w; od += w; // move to next row
xp = xc; yb = yp; yp = yc;
}
// reset pointers to point to last element in column
id -= w;
od -= w;
// reverse pass
// ensures response is symmetrical
float xn = 0.0f;
float xa = 0.0f;
float yn = 0.0f;
float ya = 0.0f;
#if CLAMP_TO_EDGE
xn = xa = *id; yn = coefn*xn; ya = yn;
#endif
#pragma unroll 160
for (int y = h-1; y >= 0; y--) {
float xc = *id;
float yc = a2*xn + a3*xa - b1*yn - b2*ya;
xa = xn; xn = xc; ya = yn; yn = yc;
*od = *od + yc;
id -= w; od -= w; // move to previous row
}
}
it runs within 2.0 ms.
so,
now i do a small change: i add a simple increment of a pointer ( k+=1; int the first for loop)
__global__ void
d_recursiveGaussian_float_ver4(float *id, float *od, int w, int h, float a0, float a1, float a2, float a3, float b1, float b2, float coefp, float coefn)
{
unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
float s[640];
float * k = s;
id += x; // advance pointers to correct column
od += x;
// forward pass
float xp = 0.0f; // previous input
float yp = 0.0f; // previous output
float yb = 0.0f; // previous output by 2
#if CLAMP_TO_EDGE
xp = *id; yb = coefp*xp; yp = yb;
#endif
#pragma unroll 160
for (int y = 0; y < h; y++) {
float xc = *k = *id;
float yc = a0*xc + a1*xp - b1*yp - b2*yb;
*od = yc;
id += w; od += w; // move to next row
xp = xc; yb = yp; yp = yc; k+=1; // SMALL CHANGE
}
// reset pointers to point to last element in column
id -= w;
od -= w;
// reverse pass
// ensures response is symmetrical
float xn = 0.0f;
float xa = 0.0f;
float yn = 0.0f;
float ya = 0.0f;
#if CLAMP_TO_EDGE
xn = xa = *id; yn = coefn*xn; ya = yn;
#endif
#pragma unroll 160
for (int y = h-1; y >= 0; y--) {
float xc = *id;
float yc = a2*xn + a3*xa - b1*yn - b2*ya;
xa = xn; xn = xc; ya = yn; yn = yc;
*od = *od + yc;
id -= w; od -= w; // move to previous row
}
}
this whole thing gets 25 times slower :blink:
Can someone explain me, why does it happen.