Hello,
I’m trying to run a cuda program that uses atomic operations. The idea is to use a global variable “global_offset” to compute offsets for each block (that will serve to copy data to a global structure).
The problem is when the number of blocks is greater than 128, the value of “global_offset” changes and becomes invalid which gives incorrect results.
here is a portion of the program
//kernel forward_exploration
__global__ void forward_exploration(unsigned int *frontier_length, unsigned int *new_frontier_length, unsigned int *block_offsets, unsigned int *global_offset)
{
unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
int thid = threadIdx.x;
block_offsets = new unsigned int[gridDim.x];
extern __shared__ unsigned int Buffer[];
unsigned int *frontier_offsets = &Buffer[0];
frontier_offsets[thid] = 0;
__shared__ unsigned int block_offset;
__syncthreads();
...
//construct shared frontier offsets (block level)
//prefix sum shared frontier_offsets
int offset = 1;
for (int d = f_length >> 1; d > 0; d >>= 1) // build sum in place up the tree
{
__syncthreads();
if (thid < d)
{
int ai = offset * ((thid << 1) + 1) - 1;
int bi = offset * ((thid << 1) + 2) - 1;
ai += CONFLICT_FREE_OFFSET(ai);
bi += CONFLICT_FREE_OFFSET(bi);
frontier_offsets[bi] += frontier_offsets[ai];
}
offset <<= 1;
}
__syncthreads();
if (thid == 0)
{
block_offset = 0;
//Save the total sum on the global block sums array
//Then clear the last element on the shared memory
if (gridDim.x == 1)
{
*new_frontier_length = frontier_offsets[f_length - 1];
}else
{
block_offsets[blockIdx.x] = frontier_offsets[f_length - 1 + CONFLICT_FREE_OFFSET(f_length - 1)];
}
frontier_offsets[blockDim.x] = frontier_offsets[f_length - 1 + CONFLICT_FREE_OFFSET(f_length - 1)];
frontier_offsets[f_length - 1] = 0;
}
for (int d = 1; d < f_length; d <<= 1) // traverse down tree & build scan
{
offset >>= 1;
__syncthreads();
if (thid < d)
{
int ai = offset * ((thid << 1) + 1) - 1;
int bi = offset * ((thid << 1) + 2) - 1;
ai += CONFLICT_FREE_OFFSET(ai);
bi += CONFLICT_FREE_OFFSET(bi);
unsigned int t = frontier_offsets[ai];
frontier_offsets[ai] = frontier_offsets[bi];
frontier_offsets[bi] += t;
}
}
__syncthreads();
//prefix sum global block_offsets (more then one block)
if (thid == 0 && gridDim.x > 1)
{
//using atomic operations
printf("%d : global_offset = %d\n",tid , *global_offset);
printf("%d : block_offsets[[blockIdx.x] = %d\n",tid , block_offsets[blockIdx.x]);
block_offset = atomicAdd(global_offset, block_offsets[blockIdx.x]);
*new_frontier_length = *global_offset;
printf("%d : new_frontier_length = %d (%d) %d\n",tid , *new_frontier_length, *global_offset, block_offset);
}
__syncthreads();
}
And the main function:
int main(int argc, char* argv[])
{
...
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
//memory allocation
unsigned int *frontier_fw_length;
CHECK(cudaMalloc ((void**) &frontier_fw_length, sizeof(unsigned int)));
CHECK(cudaMemset(frontier_fw_length, 0, sizeof(unsigned int)));
unsigned int *new_frontier_length;
CHECK(cudaMalloc ((void**) &new_frontier_length, sizeof(unsigned int)));
CHECK(cudaMemset(new_frontier_length, 0, sizeof(unsigned int)));
unsigned int *block_offsets;
CHECK(cudaMalloc ((void**) &block_offsets, sizeof(unsigned int)));
CHECK(cudaMemset(block_offsets, 0, sizeof(unsigned int)));
unsigned int *global_offset;
CHECK(cudaMalloc ((void**) &global_offset, sizeof(unsigned int)));
CHECK(cudaMemset(global_offset, 0, sizeof(unsigned int)));
unsigned int grid_size = (int)ceil((frontier_length)/32.0);
printf("launching forward_exploration <<<%d,32>>> (%d)\n", grid_size, frontier_length);
cudaEventRecord(start);
forward_exploration<<<grid_size,32,33*sizeof(unsigned int)>>>(frontier_fw_length, new_frontier_length, block_offsets, global_offset); //size of frontier_offsets = number of threads + 1 to save the last number
cudaEventRecord(stop);
cudaEventSynchronize(stop);
std::cout << "exploration terminated" << std::endl;
}
And here are some result I get from the execution
I can seem to find a solution for this problem. Does anyone know what the problem could be, or what I did wrong?
Thanks for your help,
Mira