Hi,
I am trying to measure the speed-up from using texture memory. Here (kbaseConvolution) I perform a convolution via a naive parallel implementation. This kernel takes 3.45 ms to process a 640x360 image with an 11x11 gaussian_kernel.
To further optimize and benefit from the spatial locality provided by texture memory, I implemented a similar convolution using texture memory for the 11x11 gaussian_kernel and the 640x360 image. However, this kernel (kTextureMemConvolution) runs for 95.58 ms. Why am I seeing this performance degradation? I don't understand this behavior of texture memory.
// Naive 2D convolution: one thread computes one output pixel.
//
// Expects a 2D grid/block layout covering at least nCols x nRows threads.
// Border policy is zero-padding (out-of-range taps contribute nothing).
//
//   i_gdata     - input image, row-major, nRows x nCols
//   i_gkernel   - convolution kernel, row-major, kernelWidth x kernelWidth
//   kernelWidth - must equal 2*kRadius + 1 (caller's responsibility)
//   o_gdata     - output image, same layout as i_gdata
__global__ void kbaseConvolution(float * i_gdata, float * i_gkernel, int kernelWidth, int nCols, int nRows, int kRadius, float * o_gdata)
{
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;

    // Guard the grid tail: blocks rarely divide the image exactly, and
    // without this check out-of-range threads write past o_gdata.
    if (x >= nCols || y >= nRows)
        return;

    float value = 0.0f;  // float literal, not double
    for (int j = -kRadius; j <= kRadius; j++) {
        int y0 = y + j;
        for (int i = -kRadius; i <= kRadius; i++) {
            int x0 = x + i;
            // Valid indices are [0, nRows-1] / [0, nCols-1]; the original
            // used '>' here, which let y0 == nRows / x0 == nCols slip
            // through and read one row/column past the end of the image.
            if (y0 >= 0 && x0 >= 0 && y0 < nRows && x0 < nCols) {
                value += i_gdata[y0 * nCols + x0]
                       * i_gkernel[(j + kRadius) * kernelWidth + (i + kRadius)];
            }
        }
    }
    // (Original had the typo 'int o_glo c=', which does not compile.)
    o_gdata[y * nCols + x] = value;
}
////// CONVOLUTION USING TEXTURE MEMORY //////
// Texture references bound (from host code, not shown here) to the input
// image and the gaussian kernel. NOTE(review): the texture *reference* API
// is deprecated and removed in CUDA 12 — prefer texture objects
// (cudaTextureObject_t) passed as kernel arguments.
// NOTE(review): an 11x11 filter kernel is tiny and read uniformly by all
// threads; __constant__ memory is the usual fit for it — fetching every tap
// through tex2D is a likely contributor to the observed slowdown.
texture<float, 2, cudaReadModeElementType> g_InputTexture;
texture<float, 2, cudaReadModeElementType> g_GaussianKernel;
// Texture-memory variant of the naive convolution above.
// g_InputTexture holds the nRows x nCols image; g_GaussianKernel holds the
// (2*kRadius+1)^2 filter. Border policy matches kbaseConvolution:
// zero-padding (out-of-range taps contribute nothing).
// Assumes IMAD(a,b,c) == a*b + c (project macro — defined elsewhere).
__global__ void kTextureMemConvolution(int nRows, int nCols, int kRadius, float *o_gdata)
{
    int x = IMAD(blockIdx.x, blockDim.x, threadIdx.x);
    int y = IMAD(blockIdx.y, blockDim.y, threadIdx.y);

    // Guard the grid tail so out-of-range threads do not write past o_gdata.
    if (x >= nCols || y >= nRows)
        return;

    float value = 0.0f;
    for (int j = -kRadius; j <= kRadius; j++) {
        int y0 = y + j;
        for (int i = -kRadius; i <= kRadius; i++) {
            int x0 = x + i;
            // BUG FIX: the original combined these tests with '||', which is
            // true for nearly every tap, so out-of-range texels (resolved by
            // the texture's addressing mode) were silently added to the sum.
            // '&&' restores the zero-padding semantics of the base kernel.
            if (y0 >= 0 && x0 >= 0 && y0 < nRows && x0 < nCols) {
                value += tex2D(g_InputTexture, x0, y0)
                       * tex2D(g_GaussianKernel, i + kRadius, j + kRadius);
            }
        }
    }
    o_gdata[IMAD(y, nCols, x)] = value;
}