I wrote a function that generates a 6x6 matrix in every thread, and I want to sum these matrices inside the same function. I found that the result depends on the number of __syncthreads() calls. Here is my code:
// Fragment of a CUDA kernel body. Each thread computes a matrix new_H and a
// vector new_b; the block then sums new_H element by element with a
// shared-memory tree reduction, and thread 0 of the block adds each block
// total into acc_H with atomicAdd. new_b is computed but not reduced in this
// fragment.
//
// NOTE(review): the barrier placement visible here follows the usual
// write / barrier / halve / barrier pattern, so the reported symptom (result
// changes with the number of barriers) may originate outside this fragment,
// e.g. in the launch configuration, in an early return earlier in the kernel,
// or in computeAHSemanticTrack -- TODO confirm.

// Per-block scratch buffer for the reduction: one float slot per thread.
__shared__ float dim_shared1[256];
// Linear thread index inside the block. The hard-coded 16 and the 256-entry
// buffer only give every thread a unique, in-range slot if the block is
// launched as 16x16 threads -- NOTE(review): confirm against the kernel launch.
int tx = threadIdx.x , ty = threadIdx.y ,tid = tx*16+ty;
// Global coordinates of this thread within the grid.
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
// Coordinates scaled by the downsample factor; passed to
// computeAHSemanticTrack below.
int u = x * downsample;
int v = y * downsample;
bool is_valid = true;
Matrix6f new_H;
Matrix6x1f new_b;
new_H.setZeros();
new_b.setZeros();
is_valid = computeAHSemanticTrack(u,v,voxelData,voxelIndex,huber_constant,label,depth,transform,sensor_rows,
sensor_cols, downsample, sceneIntrinsics, sceneParams, gaussian_weight, kernel_size,
residuals_threshold,new_H,new_b); // per-thread matrix generation; return value says whether the result is usable
// Threads flagged invalid contribute zeros, so they still take part in every
// barrier below without changing the sum.
if(!is_valid){
new_H.setZeros();
new_b.setZeros();
}
// Block-wide barrier. dim_shared1 has not been touched yet in this fragment,
// so as far as the visible code shows this barrier is not required.
// NOTE(review): every thread of the block must reach each __syncthreads();
// if the enclosing kernel lets some threads return before this point (not
// visible here), the barriers below have undefined behaviour.
__syncthreads();
// One full block reduction per matrix element (6 x 6 = 36 reductions).
for(int ii = 0 ; ii < 6 ; ii ++) for(int jj = 0 ; jj < 6 ; jj ++){
// Each thread publishes its own (ii,jj) entry into its slot.
dim_shared1[tid] = new_H(ii,jj);
// All slots must be written before any thread reads a neighbour's slot.
__syncthreads();
// Tree reduction: s = 128, 64, ..., 1. In each step the lower s slots
// accumulate the s slots above them; the loop reads indices up to 255.
for(int s = 128;s>0;s>>=1){
if(tid < s){
dim_shared1[tid] += dim_shared1[tid+s];
}
// Barrier is outside the if, so all threads reach it; the next step must
// only see completed partial sums.
__syncthreads();
// Extra barriers the author toggled while debugging (left disabled):
// __syncthreads();
// __syncthreads();
// __syncthreads();
// __syncthreads();
// __syncthreads();
// __syncthreads();
// __syncthreads();
// __syncthreads();
// __syncthreads(); author's note: if all of these __syncthreads() run, the answer is right
}
// The last loop iteration already ended with a barrier; this one is an
// additional barrier before dim_shared1[0] is read.
__syncthreads();
// __syncthreads();
// __syncthreads();
// After the loop dim_shared1[0] holds the block's sum for element (ii,jj);
// a single thread per block adds it into the accumulator acc_H.
if(tid == 0){
atomicAdd(&acc_H->at(ii,jj),dim_shared1[0]);
}
// Separates this element's reduction from the next iteration's writes to
// dim_shared1.
__syncthreads();
}
If there is only one __syncthreads() (instead of the repeated calls shown in the comments), the result is completely wrong.
By the way, if I use atomicAdd to sum up all the matrices directly, the answer is correct, so the matrix generation function is right.
What is the problem? My GPU is a Titan Xp and my CUDA version is 10.0.