Hello. I am trying to make a cuda program work (closest pair brute force) but there seems to be a weird behavior and i cannot trace the problem. The problem is that although the program works for less than 65K elements (confirmed results), If i get it over this the gpu seems to crash(driver restarts) and ofc i get wrong results. The problem seems to be when the program is launched with less blockDim.x * gridDim.x than the number of elements, meaning the stride wont fit the elements in one step. I also get The code is here: (dont mind the __syncthreads() i used most for testing.
__global__ void compare_points_BF( unsigned long long *dev_count, float *gbl_min_dist, point *dev_P) {
unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
unsigned int stride = blockDim.x * gridDim.x;
unsigned int cache_index, current_block, grid_offset = 0;
float local_dist, local_min = * gbl_min_dist;
point temp;
__shared__ point cache[BLOCK_SIZE];
__syncthreads();
while (tid < N){
//Each thread compares its onwn point with the onward rest points.
temp.x = dev_P[tid].x;
temp.y = dev_P[tid].y;
__syncthreads();
//cached blocks of P array stored in shared memory as cache[], current_block corresponds to P's cached sub-array
for ( current_block = blockIdx.x + (grid_offset * gridDim.x); current_block < BLOCKS; current_block++){
//check for boundaries violation within each current block
if (current_block * blockDim.x + threadIdx.x < N){
//fetch data from GLOBAL memory to SHARED memory
//coalesced memory access
cache[threadIdx.x] = dev_P[current_block * blockDim.x + threadIdx.x];
}
// synchronize threads in this block
__syncthreads();
//get the beginning of the cached block or the next point if it is a comparison on the same block, check boundaries of index
for ( current_block == blockIdx.x + (grid_offset * gridDim.x) ? cache_index = threadIdx.x+1 : cache_index=0;
(cache_index < blockDim.x) && (current_block * blockDim.x + cache_index) < N;
cache_index++){
atomicAdd( dev_count, 1);
//calculate distance of current points
local_dist = (cache[cache_index].x - temp.x) * (cache[cache_index].x - temp.x) +
(cache[cache_index].y - temp.y) * (cache[cache_index].y - temp.y);
//Determine the minimum numeric value of the arguments
if (local_dist < local_min)
local_min = local_dist;
atomicMin(gbl_min_dist, local_min);
}
//atomicMin(gbl_min_dist, local_min);
//sync for each block
__syncthreads();
}
//for Arbitrarily P[] Length
tid += stride;
grid_offset ++;
__syncthreads();
}
}
I have overload the atomicADD() for floats, do I have to do the same for atomicCAS() ?