Image Convolution [src added]

Hi…

I always appreciate your help. ^^

This time my problem is this:

I want to minimize the time an image convolution takes,
but my best result so far is 328 msec.

I have written my code below.

Would you check it and tell me what the problem is?

This is my first CUDA code, so I expect many things are not done properly.

GPU : 8800 GTX

Image size : 4096 x 3072
Kernel size : 15 x 15

Time taken : 328 msec (CUDA)
             1000 msec (OpenCV)

.cubin file info
name = _Enhance
lmem = 4124
smem = 1068
reg = 12

Here is my source:
////////////////// host code
dim3 dimGrid(8,32, 1);
dim3 dimBlock(16, 4, 1);
_Enhance<<<dimGrid, dimBlock>>>(devSrc, devDst, width, height, devKernel, kernelwidth,kernelheight);

////////////////// device code
__global__ void _Enhance(unsigned char *src, unsigned char *dst, int width, int height, float *_kernel, int kernelwidth, int kernelheight)
{
const int nWindowX = _iAlignUp(width, (blockDim.x*gridDim.x))/(blockDim.x*gridDim.x);
const int nWindowY = _iAlignUp(height,(blockDim.y*gridDim.y))/(blockDim.y*gridDim.y);

const int nStartX = (threadIdx.x + blockIdx.x*blockDim.x)*nWindowX;
const int nStartY = (threadIdx.y + blockIdx.y*blockDim.y)*nWindowY;
const int nCenterX = kernelwidth/2;
const int nCenterY = kernelheight/2;

int nConvStartX, nConvStartY, i, j, l, k, nStartIndex;
float fSum;

int nConvWidth, nConvHeight;
if( threadIdx.x == (blockDim.x-1) && blockIdx.x == (gridDim.x-1) )  nConvWidth = nWindowX-kernelwidth+1;
else																nConvWidth = nWindowX;
if( threadIdx.y == (blockDim.y-1) && blockIdx.y == (gridDim.y-1) )  nConvHeight = nWindowY-kernelheight+1;
else																nConvHeight = nWindowY;

// Create kernel in Shared Memory => 
__shared__ float kernel[16][16];

for( i=0; i<kernelheight; i++) {
	for( j=0; j<kernelwidth; j++ ) {
		kernel[i][j] = _kernel[i*kernelwidth+j];
	}
}
// <= Create Gaussian kernel in Shared Memory 


// Create SrcImage Buffer in Local Memory => 
unsigned char SrcImage[64][64];

for( i=0;i<nConvHeight+kernelheight; i++ ) {
	for( j=0;j<nConvWidth+kernelwidth; j++ ) {
		SrcImage[i][j] = src[nStartY*width+nStartX + i*width + j];
	}
}
// <= Create SrcImage Buffer in Local Memory 


// Convolution => 
for( i=0; i<nConvHeight; i++ ) {
	for( j=0; j<nConvWidth; j++ ) {
		// Convolution
		fSum = 0;
		nConvStartX = nStartX + j;
		nConvStartY = nStartY + i;
		nStartIndex = IMUL(nConvStartY,width) + nConvStartX;
		for( k=0; k<kernelheight; k++) {
			for( l=0; l<kernelwidth;l++ ) {		
				fSum += (SrcImage[i+k][j+l]*kernel[k][l]);
			}
		}

		fSum += 127;
		if( fSum>255 )	fSum = 255;
		if( fSum<0)		fSum = 0;
	
		dst[nStartIndex +IMUL(nCenterY,width) + nCenterX] = fSum;
		
	}
}	
// <= Convolution 

}
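
The launch in the host code above assumes devSrc, devDst and devKernel are already allocated and filled on the device; a minimal sketch of that setup (hostSrc, hostKernel and hostDst are assumed names, sizes taken from the post):

////////////////// host-side setup (sketch, assumed names)
unsigned char *devSrc, *devDst;
float *devKernel;
size_t imgBytes    = width * height * sizeof(unsigned char);      // 4096 x 3072
size_t kernelBytes = kernelwidth * kernelheight * sizeof(float);  // 15 x 15

cudaMalloc((void**)&devSrc, imgBytes);
cudaMalloc((void**)&devDst, imgBytes);
cudaMalloc((void**)&devKernel, kernelBytes);

cudaMemcpy(devSrc,    hostSrc,    imgBytes,    cudaMemcpyHostToDevice);
cudaMemcpy(devKernel, hostKernel, kernelBytes, cudaMemcpyHostToDevice);

_Enhance<<<dimGrid, dimBlock>>>(devSrc, devDst, width, height, devKernel, kernelwidth, kernelheight);

cudaMemcpy(hostDst, devDst, imgBytes, cudaMemcpyDeviceToHost);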


You appear to be copying the image to local memory, which will be very slow. It would be faster to read the image directly from global memory.
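
A minimal sketch of that first suggestion, with one thread per output pixel reading straight from global memory (EnhanceGlobal and the 16x16 launch are made-up names and numbers; the +127 offset and clamping are copied from the posted code):

__global__ void EnhanceGlobal(const unsigned char *src, unsigned char *dst,
                              int width, int height,
                              const float *kern, int kernelwidth, int kernelheight)
{
    // one output pixel per thread, reading the source window directly from global memory
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= width - kernelwidth + 1 || y >= height - kernelheight + 1) return;

    float fSum = 0.0f;
    for (int k = 0; k < kernelheight; k++)
        for (int l = 0; l < kernelwidth; l++)
            fSum += src[(y + k) * width + (x + l)] * kern[k * kernelwidth + l];

    fSum += 127.0f;
    if (fSum > 255.0f) fSum = 255.0f;
    if (fSum < 0.0f)   fSum = 0.0f;
    dst[(y + kernelheight / 2) * width + (x + kernelwidth / 2)] = (unsigned char)fSum;
}

// launched with e.g. dim3 block(16, 16); dim3 grid(width / 16, height / 16);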

Ideally you want to cache tiles of the image in shared memory, and store the kernel in constant memory. Have you looked at the convolutionSeparable sample in the SDK?
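
For a kernel that is not separable, a rough sketch of that idea (the names EnhanceShared, TILE, KMAX and d_Kernel are mine, not from the SDK sample; the thread layout is the same one-pixel-per-thread scheme as the sketch above):

#define TILE 16         // output pixels per block side (assumed)
#define KMAX 15         // largest kernel size this sketch supports

__constant__ float d_Kernel[KMAX * KMAX];   // filled once with cudaMemcpyToSymbol

__global__ void EnhanceShared(const unsigned char *src, unsigned char *dst,
                              int width, int height,
                              int kernelwidth, int kernelheight)
{
    // tile of source pixels plus halo, shared by all threads of the block
    __shared__ unsigned char tile[TILE + KMAX - 1][TILE + KMAX - 1];

    const int tileX = blockIdx.x * TILE;    // top-left source coordinate of this block
    const int tileY = blockIdx.y * TILE;

    // cooperatively load the (TILE + kernelwidth - 1)^2 window, a few pixels per thread
    for (int y = threadIdx.y; y < TILE + kernelheight - 1; y += blockDim.y)
        for (int x = threadIdx.x; x < TILE + kernelwidth - 1; x += blockDim.x)
            if (tileY + y < height && tileX + x < width)
                tile[y][x] = src[(tileY + y) * width + (tileX + x)];
    __syncthreads();

    const int outX = tileX + threadIdx.x;
    const int outY = tileY + threadIdx.y;
    if (outX >= width - kernelwidth + 1 || outY >= height - kernelheight + 1) return;

    float fSum = 0.0f;
    for (int k = 0; k < kernelheight; k++)
        for (int l = 0; l < kernelwidth; l++)
            fSum += tile[threadIdx.y + k][threadIdx.x + l] * d_Kernel[k * kernelwidth + l];

    fSum += 127.0f;
    if (fSum > 255.0f) fSum = 255.0f;
    if (fSum < 0.0f)   fSum = 0.0f;
    dst[(outY + kernelheight / 2) * width + (outX + kernelwidth / 2)] = (unsigned char)fSum;
}

The coefficients would be uploaded once with cudaMemcpyToSymbol(d_Kernel, hostKernel, kernelwidth * kernelheight * sizeof(float)), and the kernel launched with dim3 block(TILE, TILE) and dim3 grid(width / TILE, height / TILE). Each source pixel is then read from global memory roughly once per block instead of up to 225 times.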

/jordyvaneijk/ OK, thank you.

/Simon Green/ OK, I have read the convolutionSeparable code,

but I think that sample is only useful for separable kernels, right?

But my kernel is not separable, like a Gaussian or a LoG.

And accessing the image directly from global memory was slower than copying it into local memory first,

because the convolution has to access memory so many times (a 15 x 15 kernel touches 225 input pixels per output pixel).