How to avoid a race condition?

Hi all,
I am trying to write a program that reads and writes to the same array. For example, my CUDA kernel code is as below:

__global__ void floydWarshall_kernel( int* graph, int n )
{
	int k;
	int j = threadIdx.x;
	int i = threadIdx.y;

	if ((i < n) && (j < n) && (i != j))
	{
		for (k = 0; k < n; ++k)
		{
			int ij, ik, kj;
			ij = graph[j*n+i];   // current distance i -> j (0 means "no path")
			ik = graph[k*n+i];   // distance i -> k
			kj = graph[j*n+k];   // distance k -> j
			if ((ik * kj != 0) && (i != j))
				if ((ij >= ik + kj) || (ij == 0))
					graph[j*n+i] = ik + kj;   // relax the path through k
		}
	}
}

With this kernel I am not getting the expected result. I guess there is a race condition among threads that read and write data at the same memory address. How do I avoid the race condition? I read that race conditions can be avoided using the __syncthreads() function. Can anyone clarify how and where to use that function in the above kernel? Also, the function synchronizes threads only within the same block. How do I avoid race conditions between threads that reside in different blocks?

Any suggestions would be highly appreciated.

Thanks

Try to add __syncthreads() after each read or write of memory.

Hi xuleimath,

I added __syncthreads() as per your suggestion, but it is still not working.

__global__ void floydWarshall_kernel( int* graph, int n )
{
	int k;
	int j = threadIdx.x;
	int i = threadIdx.y;

	if ((i < n) && (j < n) && (i != j))
	{
		for (k = 0; k < n; ++k)
		{
			int ij, ik, kj;
			ij = graph[j*n+i]; __syncthreads();
			ik = graph[k*n+i]; __syncthreads();
			kj = graph[j*n+k]; __syncthreads();
			if ((ik * kj != 0) && (i != j))
				if ((ij >= ik + kj) || (ij == 0))
					graph[j*n+i] = ik + kj; __syncthreads();
		}
	}
}

The input matrix is:

   0   0   0  16   0   0  16   7   0   0
   8   0  10   0   0   0   0   0   0   0
  10   1   0   0   0   0   4   9   0   0
   0   0   0   0   0   0   0   4   0  16
   0   0   0   0   0   0   0  10   3   0
   0  17   0   0  14   0   0   0   0   3
  16   0   0   0  19   0   0   0   0   0
   0   7  14   0   0   0   1   0   0   0
   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   2   0   0   0   0

Expected output:

   0  14  21  16  27  34   8   7  30  32
   8   0  10  24  33  42  14  15  36  40
   9   1   0  25  23  43   4   9  26  41
  19  11  18   0  24  18   5   4  27  16
  25  17  24  41   0  59  11  10   3  57
  25  17  27  41  14   0  25  24  17   3
  16  30  37  32  19  50   0  23  22  48
  15   7  14  31  20  49   1   0  23  47
   0   0   0   0   0   0   0   0   0   0
  27  19  29  43  16   2  27  26  19   0

Output obtained:

   0   0   0  16   0   0  16   7   0   0
   8   0  10   0   0   0   0   0   0   0
  10   1   0   0   0   0   4   9   0   0
   0   0   0   0   0   0   0   4   0  16
   0   0   0   0   0   0   0  10   3   0
   0  17   0   0  14   0   0   0   0   3
  16   0   0   0  19   0   0   0   0   0
   0   7  14   0   0   0   1   0   0   0
   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   2   0   0   0   0

So it seems that no write operation is taking effect…

Again, __syncthreads() works only for threads within the same block. How do I avoid race conditions caused by threads that reside in different blocks?

Change (partition) your algorithm such that different blocks work on different parts of the array.

If that is not possible, consider using atomic instructions for reads and writes, but prepare for slow execution.
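For illustration, a minimal sketch of the atomic variant (this is an assumption-laden example, not your code: it supposes you switch the "no edge" encoding from 0 to a large sentinel such as INT_MAX/2 so the relaxation becomes a plain minimum, it processes one k per kernel launch, and the kernel name is just a placeholder):

#include <limits.h>

#define INF (INT_MAX / 2)   // assumed sentinel for "no edge" instead of 0

__global__ void relax_k_atomic( int* dist, int n, int k )
{
	int j = blockIdx.x * blockDim.x + threadIdx.x;
	int i = blockIdx.y * blockDim.y + threadIdx.y;
	if (i >= n || j >= n || i == j) return;

	int ik = dist[k*n+i];
	int kj = dist[j*n+k];
	if (ik < INF && kj < INF)
		atomicMin(&dist[j*n+i], ik + kj);   // atomic update: safe even if another thread touches the same cell
}

The host then loops over k and launches this kernel once per step, so the ordering of launches provides the global synchronization between k iterations.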

Christian

Thanks Christian,

Now I changed the kernel code like this:

__global__ void funct1_kernel( int* a, int n, int k )
{
	int j = blockIdx.x * BLOCKSIZE + threadIdx.x;
	int i = blockIdx.y * BLOCKSIZE + threadIdx.y;

	if ((i < n) && (j < n) && (i != j))   // bounds check before touching memory
	{
		int ij = a[j*n+i];
		int ik = a[k*n+i];
		int kj = a[j*n+k];

		if (ik * kj != 0)
			if ((ij >= ik + kj) || (ij == 0))
			{
				a[j*n+i] = ik + kj;
			}
	}
}

and in the host part, I did:

.....

for (int i = 0; i < numberOfNodes; i++)
{
	funct1_kernel<<<blocks, threads>>>(d_array, n, i);
}
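For reference, the way I set up the launch is roughly like this (only a sketch from memory: BLOCKSIZE, h_array and the cudaMemcpy calls are filled in here as assumptions, not copied from my actual file):

#define BLOCKSIZE 16   // assumed block edge length

dim3 threads(BLOCKSIZE, BLOCKSIZE);
dim3 blocks( (n + BLOCKSIZE - 1) / BLOCKSIZE,
             (n + BLOCKSIZE - 1) / BLOCKSIZE );

for (int k = 0; k < n; k++)
{
	// one launch per k; launches on the same stream run in order,
	// so each k step sees the completed results of the previous one
	funct1_kernel<<<blocks, threads>>>(d_array, n, k);
}
cudaDeviceSynchronize();
cudaMemcpy(h_array, d_array, n * n * sizeof(int), cudaMemcpyDeviceToHost);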

It is giving me the correct result. Also, for n = 1000 the GPU execution is about 150 times faster, but as I increase n the performance of the GPU degrades; for example, for n = 4000 the GPU is only 4 times faster than the CPU version. Is this due to increased global memory access? If so, how can it be reduced?

Suppose your sequential program is:

void floydWarshall_cpu( int *A, int n )
{
	int i, j, k;

	for( i = 0; i < n; i++){
		for( j = 0; j < n; j++){
			if ( i != j ){
				for (k = 0; k < n; ++k){
					int ij, ik, kj;
					ij = A[j*n+i];
					ik = A[k*n+i];
					kj = A[j*n+k];
					if ((ik * kj != 0) && (i != j))
					if ((ij >= ik + kj) || (ij == 0))
						A[j*n+i] = ik + kj;
				}// for k
			}
		}// for j
	}// for i
}

This has a race condition when parallelized, because the entries A(s,t) with (s,t) < (i,j) have already been updated in place by the time they are read.

Consider another program:

void floydWarshall_cpu_v2( int *A, int *B, int n )
{
	int i, j, k;

	for( i = 0; i < n; i++){
		for( j = 0; j < n; j++){
			if ( i != j ){
				for (k = 0; k < n; ++k){
					int ij, ik, kj;
					ij = A[j*n+i];
					ik = A[k*n+i];
					kj = A[j*n+k];
					if ((ik * kj != 0) && (i != j))
					if ((ij >= ik + kj) || (ij == 0))
						B[j*n+i] = ik + kj;
				}// for k
			}else{
				B[j*n+i] = A[j*n+i];
			}
		}// for j
	}// for i
}

The two programs generate different results:

// floydWarshall_cpu
   0  14  21  16  35   0   8   7  38  32
   8   0  10  24  43   0  14  15  46  40
   9   1   0  25  23   0   4   9  26  41
   0  11  18   0  41  18   5   4  44  16
   0  17  24  41   0  59  11  10   3  57
  25  17  27  41  14   0  25  24  17   3
  16  30  37  32  19  50   0  23  22  48
  15   7  14  31  20  49   1   0  23  47
   0   0   0   0   0   0   0   0   0   0
  27  19  29  43  16   2  27  26  19   0

// floydWarshall_cpu_v2
   0  14  21  16  35   0   8   7   0  32
   8   0  10  24   0   0  14  19   0   0
   9   1   0  26  23   0   4   9   0   0
   0  11  18   0   0  18   5   4   0  16
   0  17  24   0   0   0  11  10   3   0
  25  17  27   0  14   0   0  24  17   3
  16   0   0  32  19   0   0  29  22   0
  17   7  14   0  20   0   1   0   0   0
   0   0   0   0   0   0   0   0   0   0
   0  19   0   0  16   2   0   0   0   0

So which one do you want?

As I see it, you call __syncthreads() inside a data-dependent branch (if ((i<n) && (j<n) && (i!=j))); normally the program should hang in this case, or the results can be unexpected. In any case, __syncthreads() should be called in uniform control flow, i.e. at the top level of the kernel or inside a branch taken by all threads of the block…
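For illustration, a minimal sketch of that pattern (only the loads and the store are guarded by a per-thread predicate while the barrier stays in uniform control flow; this shows where __syncthreads() may legally go, it does not by itself remove the race between blocks, and the kernel name is just a placeholder):

__global__ void kernel_with_uniform_sync( int* graph, int n, int k )
{
	int j = blockIdx.x * blockDim.x + threadIdx.x;
	int i = blockIdx.y * blockDim.y + threadIdx.y;
	bool active = (i < n) && (j < n) && (i != j);   // per-thread predicate

	int ij = 0, ik = 0, kj = 0;
	if (active) {                 // guard only the memory reads
		ij = graph[j*n+i];
		ik = graph[k*n+i];
		kj = graph[j*n+k];
	}
	__syncthreads();              // executed by every thread of the block

	if (active && (ik * kj != 0) && ((ij >= ik + kj) || (ij == 0)))
		graph[j*n+i] = ik + kj;   // guard only the write
}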

Thanks LSChien,
I want the second one. Let me try it.
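My first attempt at porting floydWarshall_cpu_v2 to a kernel would look roughly like this (just a sketch with one thread per (i,j); A is only read and each B element is written only by its own thread, so there is no race between threads; the kernel name and the second device buffer B are mine):

__global__ void floydWarshall_v2_kernel( const int* A, int* B, int n )
{
	int j = blockIdx.x * blockDim.x + threadIdx.x;
	int i = blockIdx.y * blockDim.y + threadIdx.y;
	if (i >= n || j >= n) return;

	if (i != j) {
		int ij = A[j*n+i];
		for (int k = 0; k < n; ++k) {
			int ik = A[k*n+i];
			int kj = A[j*n+k];
			if ((ik * kj != 0) && ((ij >= ik + kj) || (ij == 0)))
				B[j*n+i] = ik + kj;   // same update rule as the CPU v2: last qualifying k wins
		}
	} else {
		B[j*n+i] = A[j*n+i];
	}
}

Like the CPU version, entries with no qualifying k are never written, so I would probably need to initialize B first (for example by copying A into it with cudaMemcpy).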