Hi…
I always appreciate yours helps…^^
This time my problem is…
I want to minize the Take Time of Image convolution
but my result is just 328msec
i write my code here
would you check my code and tell me what is the problem…
this is my first code in cuda…so i think many thing is not appropriate…
8800 GTX
Image Size : 4096 x 3072
Kernel Size : 15 X15
Take Time : 328 msec ( CUDA )
1000 msec ( OpenCV )
.cubin file info
name = _Enhance
lmem = 4124
smem = 1068
reg = 12
Here is My Source
////////////////// host code
dim3 dimGrid(8,32, 1);
dim3 dimBlock(16, 4, 1);
_Enhance<<<dimGrid, dimBlock>>>(devSrc, devDst, width, height, devKernel, kernelwidth,kernelheight);
////////////////// device code
global void _Enhance(unsigned char src, unsigned char dst, int width, int height, float _kernel, int kernelwidth, int kernelheight)
{
const int nWindowX = _iAlignUp(width, (blockDim.xgridDim.x))/(blockDim.xgridDim.x);
const int nWindowY = _iAlignUp(height,(blockDim.ygridDim.y))/(blockDim.y*gridDim.y);
const int nStartX = (threadIdx.x + blockIdx.x*blockDim.x)*nWindowX; const int nStartY = (threadIdx.y + blockIdx.y*blockDim.y)*nWindowY;
const int nCenterX = kernelwidth/2; const int nCenterY = kernelheight/2;
int nConvStartX, nConvStartY, i, j, l, k, nStartIndex;
float fSum;
int nConvWidth, nConvHeight;
if( threadIdx.x == (blockDim.x-1) && blockIdx.x == (gridDim.x-1) ) nConvWidth = nWindowX-kernelwidth+1;
else nConvWidth = nWindowX;
if( threadIdx.y == (blockDim.y-1) && blockIdx.y == (gridDim.y-1) ) nConvHeight = nWindowY-kernelheight+1;
else nConvHeight = nWindowY;
// Create kernel in Shared Memory =>
__shared__ float kernel[16][16];
for( i=0; i<kernelheight; i++) {
for( j=0; j<kernelwidth; j++ ) {
kernel[i][j] = _kernel[i*kernelwidth+j];
}
}
// <= Create Gaussian kernel in Shared Memory
// Create SrcImage Buffer in Local Memory =>
unsigned char SrcImage[64][64];
for( i=0;i<nConvHeight+kernelheight; i++ ) {
for( j=0;j<nConvWidth+kernelwidth; j++ ) {
SrcImage[i][j] = src[nStartY*width+nStartX + i*width + j];
}
}
// <= Create SrcImage Buffer in Shared Memory
// Convolution =>
for( i=0; i<nConvHeight; i++ ) {
for( j=0; j<nConvWidth; j++ ) {
// Convolution
fSum = 0;
nConvStartX = nStartX + j;
nConvStartY = nStartY + i;
nStartIndex = IMUL(nConvStartY,width) + nConvStartX;
for( k=0; k<kernelheight; k++) {
for( l=0; l<kernelwidth;l++ ) {
fSum += (SrcImage[i+k][j+l]*kernel[k][l]); }
}
fSum += 127;
if( fSum>255 ) fSum = 255;
if( fSum<0) fSum = 0;
dst[nStartIndex +IMUL(nCenterY,width) + nCenterX] = fSum;
}
}
// <= Convolution
}