a problem about parallel programming in CUDA

Hi everyone,

I am a beginer of CUDA. I try to use CUDA in image processing. In my project, i want to find the points, which pixels are below 3. Finally i want to get these points’ number and coordinate.

At last, i get the right number(15) and some coordinate but not all of them.
0,0
259,0
666,137
263,206
393,205
947,204
262,208
0,0
0,0
0,0
394,479
0,0
805,488
806,518
806,528
As you can see, i get five 0,0 points. In the picture, the first 0,0 and 259,0 are the same block in my CUDA program. And the next three 0,0 points are the same block with 394,479. And last 0,0 is same block with 805,488.

That means, in one block, defferent threads use the same buffe “d_numCorners”. Different point coordinate was stored in the same place.

So what should I do?

I use nVIDIA Tegra K1, ubuntu 14.04, Nsight Eclipse Edition Version: 6.5 and opencv 2.4.9.

Thank you for your help,
Liang

#include <opencv2/core/cuda_devptrs.hpp>
#include <opencv2/opencv.hpp>
#include "cuda.h"
#include "cuda_runtime_api.h"
#include <stdio.h>
#include <stdlib.h>

typedef struct { int x, y; } xy;
typedef unsigned char uchar;
using namespace cv;
using namespace std;
using namespace cv::gpu;

__global__ void test(const uchar* im, uchar* im_out,int nWidth,int nHeight,int stride,int* d_numCorners, xy* d_corners)
{
//		__shared__ int numCorners_private;
//		numCorners_private = 0;
//		__syncthreads();
	int x = threadIdx.x + blockIdx.x * blockDim.x;
	int y = threadIdx.y + blockIdx.y * blockDim.y;

	if(0<=x && x< nWidth && 0<=y && y < nHeight)
	{
		int tid = y*stride + x;
		if(*(im+tid) < 3)
		{
			d_corners[*d_numCorners].x = x;
			d_corners[*d_numCorners].y = y;
			atomicAdd(d_numCorners, 1);
		}
	}
}

void cuda_caller(const Mat img_1, Mat img_out)
{
	img_out.create(img_1.size(), img_1.type());
	xy  *d_corners;
	uchar* d_img_1, *d_img_out;
	int nWidth = img_1.cols;
	int nHeight = img_1.rows;

	int stride = img_1.cols;
	int *d_numCorners;
	int numCorners;
	int byte = nWidth * nHeight * sizeof(uchar);
	uchar* p = (uchar*) img_1.data;
	unsigned int corners_size = 1000000 * sizeof(xy);
	xy* corners = (xy*)malloc( corners_size );

	cudaMalloc((void**)&d_corners,corners_size);
	cudaMalloc((void**)&d_img_1, byte );
	cudaMalloc((void**)&d_img_out, byte );
	cudaMalloc((void**)&d_numCorners, sizeof(int) );

	cudaMemset( d_numCorners, 0, sizeof( int ) ) ;
	cudaMemset( d_corners, 0, corners_size ) ;
	cudaMemcpy(d_img_1, p, byte, cudaMemcpyHostToDevice);

	dim3 block(32,8);
	dim3 grid((img_1.cols + block.x - 1)/block.x,
			  (img_1.rows + block.y - 1)/block.y);

	test<<<grid,block,0>>>(d_img_1,d_img_out,nWidth,nHeight,stride,d_numCorners,d_corners);

	cudaMemcpy(img_out.data, d_img_out, byte, cudaMemcpyDeviceToHost);
	cudaMemcpy(&numCorners, d_numCorners, sizeof( int ), cudaMemcpyDeviceToHost);
	cudaMemcpy(corners, d_corners, corners_size, cudaMemcpyDeviceToHost);

	cudaFree(d_img_1);
	cudaFree(d_img_out);
	cudaFree(d_numCorners);
	cudaFree(d_corners);


	printf("There are %i Corners\n",numCorners);
	for(int i=0 ; i<numCorners ; i++)
    	printf("%i,%i\n",corners[i].x,corners[i].y);
	namedWindow( "Display window", WINDOW_AUTOSIZE ); 
	imshow( "Display window", img_out);                   

	free(corners);
	cudaDeviceSynchronize();
}

Hi Liang,

There are two small problem in your code:

  1. The number of allocated thread needs to be larger than image pixel number. Or you will miss to handle some pixels.
--- dim3 grid((img_1.cols + block.x - 1)/block.x, (img_1.rows + block.y - 1)/block.y);
+++ dim3 grid((img_1.rows/block.x)+1, (img_1.cols/block.y)+1);
  1. There is a race condition in your kernel code
d_corners[*d_numCorners].x = x;

Threads may have the same d_numCorners value and try to write the same buffer location.

It’s easy to solve by this:

int idx = atomicAdd(d_numCorners, 1);
d_corners[idx].x = x;
d_corners[idx].y = y;

AtomicAdd will force thread to add number sequentially and return the old value.
Since the add is sequential, the return value is unique and you can just re-use it as buffer index to avoid racing issue.

https://docs.nvidia.com/cuda/cuda-c-programming-guide/#atomicadd