Getting black(0) result in memory, with no error reported

I am trying to inmplement NLM denoising(similar to the one in the NVIDIA samples).

I have a strange problem.

If I set the search window radius into 10 (diameter 21) then everything works and I get a reasonable result.

If I set the search window radius into 20 (diameter 41) then I get a black image (0) as the result. Which shouldn’t happen.

Perhaps there is something I am missing about resources usage.

I have enough registers to use the threads, and I don’t use shared memory.

SearchSize is the window radius.

Here is my code(hope this is not too much code):

extern "C" void

ResizeImagePlain(unsigned char * Sequence, int Width, int Height, int Depth, unsigned char * Result)

{

	int DepthSearch = 1;

	int SearchSize = 20;

	int CompareSize = 6;

	int n = Width*Height*sizeof(unsigned char);

	unsigned char* GPUSequence;

	CUDA_SAFE_CALL(cudaMalloc((void**) &GPUSequence, n*DepthSearch));

	for (int i=0; i<DepthSearch; i++)

		CUDA_SAFE_CALL(cudaMemcpy(GPUSequence+i*n, Sequence+i*n, n,

								  cudaMemcpyHostToDevice) );

	unsigned char * GPUResult;

	CUDA_SAFE_CALL(cudaMalloc((void**) &GPUResult, n));

	dim3 grid(Width/10, Height/10);

	dim3 threads(10, 10);

	ResizeNLMPlain<<< grid, threads >>>(1, Width, Height, DepthSearch, CompareSize, SearchSize, GPUSequence, GPUResult);

				cudaError_t err = cudaGetLastError();									

			if( cudaSuccess != err) {												

				char * s = (char *)malloc (sizeof(char)*256);

				sprintf(s, "Cuda error: %s in file '%s' in line %i : %s.\n",	

						"a", __FILE__, __LINE__, cudaGetErrorString( err) );

				MessageBox (NULL, s, "CUDA Error", MB_OK);

				exit(EXIT_FAILURE);												  

			}

	CUDA_SAFE_CALL(cudaMemcpy(Result, GPUResult, n,

							  cudaMemcpyDeviceToHost));

	CUDA_SAFE_CALL(cudaFree(GPUSequence));

	CUDA_SAFE_CALL(cudaFree(GPUResult));

}

__global__ void

ResizeNLMPlain (int Scale, int Width, int Height, int Depth, int CompareWindow, int SearchWindow, unsigned char * Sequence, unsigned char * Result)

{

	long tx = (threadIdx.x+blockIdx.x*10);

	long ty = (threadIdx.y+blockIdx.y*10);

/*	Result[tx*Height+ty] = Sequence[tx+ty*Width];

	return;*/

/*	StartX = ((StartX/Scale)*Scale)+(Scale/2);

	StartY = ((StartY/Scale)*Scale)+(Scale/2);

	EndX = ((EndX/Scale)*Scale)+(Scale/2);

	EndY = ((EndY/Scale)*Scale)+(Scale/2);*/

	float Sum=0;

	float Weight = 0.;

	for (int CountY=-SearchWindow; CountY<=SearchWindow; CountY+=Scale)

		for (int CountX=-SearchWindow; CountX<=SearchWindow; CountX+=Scale)

		{

			int x = max(tx+CountX, 0);

			x = min(x, Width-1);

			int y = max(ty+CountY, 0);

			y = min(y, Height-1);

			float w = 0.;

			for (int y2=-CompareWindow; y2<=CompareWindow; y2++)

			{

				for (int x2=-CompareWindow; x2<=CompareWindow; x2++)

				{

					int cx = max(tx+x2, 0);

					cx = min(cx, Width-1);

					int cy = max(ty+y2, 0);

					cy = min(cy, Height-1);

					int lx = max(x+x2, 0);

					lx = min(lx, Width-1);

					int ly = max(y+y2, 0);

					ly = min(ly, Height-1);

					float a = (float)(Sequence[lx+ ly*Width]-Sequence[cx+cy*Width])/255.;

//					float a = (float)(Sequence[x+ y*Width]-Sequence[tx+ty*Width])/255.;

					w+=a*a;

				}

			}

//			int i = y-((EndY-StartY)/2);

//			int j = x-((EndX-StartX)/2);

//			w = expf( -(sqrt(w) + (i * i + j * j) / ((EndY-StartY)*(EndX-StartX))  ));

			w = 0.001/(w+0.001);

			Sum+=(((float)Sequence[x+y*Width])/255.f)*w;

			Weight+=w;

		}

	Sum/=Weight;

	Result[tx*Height+ty] = (unsigned char)(Sum*255.f);

}

Do you notice anything wrong?