I am trying to inmplement NLM denoising(similar to the one in the NVIDIA samples).
I have a strange problem.
If I set the search window radius into 10 (diameter 21) then everything works and I get a reasonable result.
If I set the search window radius into 20 (diameter 41) then I get a black image (0) as the result. Which shouldn’t happen.
Perhaps there is something I am missing about resources usage.
I have enough registers to use the threads, and I don’t use shared memory.
SearchSize is the window radius.
Here is my code(hope this is not too much code):
extern "C" void
ResizeImagePlain(unsigned char * Sequence, int Width, int Height, int Depth, unsigned char * Result)
{
int DepthSearch = 1;
int SearchSize = 20;
int CompareSize = 6;
int n = Width*Height*sizeof(unsigned char);
unsigned char* GPUSequence;
CUDA_SAFE_CALL(cudaMalloc((void**) &GPUSequence, n*DepthSearch));
for (int i=0; i<DepthSearch; i++)
CUDA_SAFE_CALL(cudaMemcpy(GPUSequence+i*n, Sequence+i*n, n,
cudaMemcpyHostToDevice) );
unsigned char * GPUResult;
CUDA_SAFE_CALL(cudaMalloc((void**) &GPUResult, n));
dim3 grid(Width/10, Height/10);
dim3 threads(10, 10);
ResizeNLMPlain<<< grid, threads >>>(1, Width, Height, DepthSearch, CompareSize, SearchSize, GPUSequence, GPUResult);
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err) {
char * s = (char *)malloc (sizeof(char)*256);
sprintf(s, "Cuda error: %s in file '%s' in line %i : %s.\n",
"a", __FILE__, __LINE__, cudaGetErrorString( err) );
MessageBox (NULL, s, "CUDA Error", MB_OK);
exit(EXIT_FAILURE);
}
CUDA_SAFE_CALL(cudaMemcpy(Result, GPUResult, n,
cudaMemcpyDeviceToHost));
CUDA_SAFE_CALL(cudaFree(GPUSequence));
CUDA_SAFE_CALL(cudaFree(GPUResult));
}
__global__ void
ResizeNLMPlain (int Scale, int Width, int Height, int Depth, int CompareWindow, int SearchWindow, unsigned char * Sequence, unsigned char * Result)
{
long tx = (threadIdx.x+blockIdx.x*10);
long ty = (threadIdx.y+blockIdx.y*10);
/* Result[tx*Height+ty] = Sequence[tx+ty*Width];
return;*/
/* StartX = ((StartX/Scale)*Scale)+(Scale/2);
StartY = ((StartY/Scale)*Scale)+(Scale/2);
EndX = ((EndX/Scale)*Scale)+(Scale/2);
EndY = ((EndY/Scale)*Scale)+(Scale/2);*/
float Sum=0;
float Weight = 0.;
for (int CountY=-SearchWindow; CountY<=SearchWindow; CountY+=Scale)
for (int CountX=-SearchWindow; CountX<=SearchWindow; CountX+=Scale)
{
int x = max(tx+CountX, 0);
x = min(x, Width-1);
int y = max(ty+CountY, 0);
y = min(y, Height-1);
float w = 0.;
for (int y2=-CompareWindow; y2<=CompareWindow; y2++)
{
for (int x2=-CompareWindow; x2<=CompareWindow; x2++)
{
int cx = max(tx+x2, 0);
cx = min(cx, Width-1);
int cy = max(ty+y2, 0);
cy = min(cy, Height-1);
int lx = max(x+x2, 0);
lx = min(lx, Width-1);
int ly = max(y+y2, 0);
ly = min(ly, Height-1);
float a = (float)(Sequence[lx+ ly*Width]-Sequence[cx+cy*Width])/255.;
// float a = (float)(Sequence[x+ y*Width]-Sequence[tx+ty*Width])/255.;
w+=a*a;
}
}
// int i = y-((EndY-StartY)/2);
// int j = x-((EndX-StartX)/2);
// w = expf( -(sqrt(w) + (i * i + j * j) / ((EndY-StartY)*(EndX-StartX)) ));
w = 0.001/(w+0.001);
Sum+=(((float)Sequence[x+y*Width])/255.f)*w;
Weight+=w;
}
Sum/=Weight;
Result[tx*Height+ty] = (unsigned char)(Sum*255.f);
}
Do you notice anything wrong?