I will implement a Sobel Filter, but the destination buffer is the same of the source…

The program run without errors

The code runs very well in debug mode…

void CUDA_SimpleSobel(unsigned char* src, unsigned char* dst, int size)


	//load src1 to the device

	unsigned char* src1D;

	cudaError_t s = cudaMalloc((void**)&src1D, size);

	if (s != cudaSuccess)


		printf("Error allocating Sobel src\n");


	s = cudaMemcpy(src1D, src, size, cudaMemcpyHostToDevice);

	if (s != cudaSuccess)


		printf("Error copying Sobel src\n");



	// Allocate destination to the device memory

	unsigned char* dstD;

	s = cudaMalloc((void**)&dstD, size);

	if (s != cudaSuccess)


		printf("Error allocating Sobel dst\n");



	dim3 dimBlock(LINE_SIZE);

	dim3 dimGrid(size / dimBlock.x);


	//cudaEventRecord(start, 0);

	//CUDA_SobelImpl<<<dimGrid, dimBlock>>>(src1D, dstD);


	cudaMemcpy(dst, dstD, size, cudaMemcpyDeviceToHost);

	//cudaEventRecord(stop, 0);


	// Free device memory




__global__ void CUDA_SobelImpl(unsigned char* src, unsigned char* dst)


	//Block index

	int bx = blockIdx.x;

	// Thread index

	int tx = threadIdx.x;

	dst[tx + (bx + 1) * LINE_SIZE] = 0;


	if (bx >= 144 - 1)




	unsigned char* line1;

	unsigned char* line2;

	unsigned char* line3;

	line1 = src + tx + bx * LINE_SIZE;

	line2 = src + tx + (bx + 1) * LINE_SIZE;

	line3 = src + tx + (bx + 2) * LINE_SIZE;

//	if (tx < LINE_SIZE - 1)

		dst[tx + (bx + 1) * LINE_SIZE] = ComputeSobel(line1[0], line1[1], line1[2], 

									line2[0], line2[1], line2[2], 

									line3[0], line3[1], line3[2]);


__device__ unsigned char ComputeSobel(unsigned char ul, // upper left

									 unsigned char um, // upper middle

									 unsigned char ur, // upper right

									 unsigned char ml, // middle left

									 unsigned char mm, // middle (unused)

									 unsigned char mr, // middle right

									 unsigned char ll, // lower left

									 unsigned char lm, // lower middle

									 unsigned char lr)


	short horz = ur + 2*mr + lr - ul - 2*ml - ll;

	short vert = ul + 2*um + ur - ll - 2*lm - lr;

	short sum = (short)(abs(horz) + abs(vert)) / 2;

	sum = (sum > 0xFF) ? 0xFF : sum;

	return (unsigned char)sum;


I wonder why don't you use the original example code from an SDK browser.

I wonder why don’t you use the original example code from an SDK browser.

CUDA Manual (section says that all external variables in the shared memory

start with the same address - do you think it is related to your problem?

Another thing that I see is that

size / dimBlock.x

can underestimate the number of blocks that you need since it is an integer division…