atomicAdd

Hello,

I’m trying to run a cuda program that uses atomic operations. The idea is to use a global variable “global_offset” to compute offsets for each block (that will serve to copy data to a global structure).
The problem is when the number of blocks is greater than 128, the value of “global_offset” changes and becomes invalid which gives incorrect results.
here is a portion of the program

//kernel forward_exploration
__global__ void forward_exploration(unsigned int *frontier_length, unsigned int *new_frontier_length, unsigned int *block_offsets, unsigned int *global_offset)
{
   unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
   int thid = threadIdx.x;
   block_offsets = new unsigned int[gridDim.x];
   extern __shared__ unsigned int Buffer[];
   unsigned int *frontier_offsets = &Buffer[0];
   frontier_offsets[thid] = 0;
   __shared__ unsigned int block_offset;

   __syncthreads();

   ...

   //construct shared frontier offsets (block level) 
   //prefix sum shared frontier_offsets 
   int offset = 1;
   for (int d = f_length >> 1; d > 0; d >>= 1) // build sum in place up the tree
   {
		__syncthreads();
		if (thid < d)
		{	 	
			int ai = offset * ((thid << 1) + 1) - 1;
			int bi = offset * ((thid << 1) + 2) - 1;
			ai += CONFLICT_FREE_OFFSET(ai);
			bi += CONFLICT_FREE_OFFSET(bi); 
			frontier_offsets[bi] += frontier_offsets[ai];
		}
		offset <<= 1;
	}	
	__syncthreads();

	if (thid == 0) 
	{ 
		block_offset = 0;
		//Save the total sum on the global block sums array
		//Then clear the last element on the shared memory
		if (gridDim.x == 1)
		{
			*new_frontier_length = frontier_offsets[f_length - 1];
		}else
		{
			block_offsets[blockIdx.x] = frontier_offsets[f_length - 1 + CONFLICT_FREE_OFFSET(f_length - 1)]; 
		}
		frontier_offsets[blockDim.x] = frontier_offsets[f_length - 1 + CONFLICT_FREE_OFFSET(f_length - 1)];
		frontier_offsets[f_length - 1] = 0;
	}
	
	for (int d = 1; d < f_length; d <<= 1) // traverse down tree & build scan
	{
		offset >>= 1;
		__syncthreads();
		if (thid < d)
		{ 
			int ai = offset * ((thid << 1) + 1) - 1;
			int bi = offset * ((thid << 1) + 2) - 1;
			ai += CONFLICT_FREE_OFFSET(ai);
			bi += CONFLICT_FREE_OFFSET(bi);
			unsigned int t = frontier_offsets[ai];
			frontier_offsets[ai] = frontier_offsets[bi];
			frontier_offsets[bi] += t;
		}
	}
 	__syncthreads();

    //prefix sum global block_offsets (more then one block) 
	if (thid == 0 && gridDim.x > 1)
	{
		//using atomic operations
		printf("%d : global_offset = %d\n",tid , *global_offset);
		printf("%d : block_offsets[[blockIdx.x] = %d\n",tid , block_offsets[blockIdx.x]);
		block_offset = atomicAdd(global_offset, block_offsets[blockIdx.x]); 
		*new_frontier_length = *global_offset;
		printf("%d : new_frontier_length = %d (%d) %d\n",tid , *new_frontier_length, *global_offset, block_offset);
	}
	__syncthreads();
}

And the main function:

int main(int argc, char* argv[]) 
{
   ... 
   cudaEvent_t start, stop;
   cudaEventCreate(&start);
   cudaEventCreate(&stop);

   //memory allocation
   unsigned int *frontier_fw_length;
   CHECK(cudaMalloc ((void**) &frontier_fw_length, sizeof(unsigned int)));
   CHECK(cudaMemset(frontier_fw_length, 0, sizeof(unsigned int)));
   unsigned int *new_frontier_length;
   CHECK(cudaMalloc ((void**) &new_frontier_length, sizeof(unsigned int)));
   CHECK(cudaMemset(new_frontier_length, 0, sizeof(unsigned int)));
   unsigned int *block_offsets;
   CHECK(cudaMalloc ((void**) &block_offsets, sizeof(unsigned int)));
   CHECK(cudaMemset(block_offsets, 0, sizeof(unsigned int)));
   unsigned int *global_offset;
   CHECK(cudaMalloc ((void**) &global_offset, sizeof(unsigned int)));
   CHECK(cudaMemset(global_offset, 0, sizeof(unsigned int)));
   unsigned int grid_size = (int)ceil((frontier_length)/32.0);
   printf("launching forward_exploration <<<%d,32>>> (%d)\n", grid_size, frontier_length); 
   cudaEventRecord(start);

   forward_exploration<<<grid_size,32,33*sizeof(unsigned int)>>>(frontier_fw_length, new_frontier_length, block_offsets, global_offset);	//size of frontier_offsets = number of threads + 1 to save the last number
   cudaEventRecord(stop);
   cudaEventSynchronize(stop);
   std::cout << "exploration terminated" << std::endl;
}

And here are some result I get from the execution


I can seem to find a solution for this problem. Does anyone know what the problem could be, or what I did wrong?

Thanks for your help,
Mira