// Micro-benchmark that reproduces shared-memory bank conflicts reported by the
// profiler when only some warps of a block read from a shared buffer.
//
// Launch layout: 1-D grid of kGridBlocks blocks, each block is
// kBlockDimX x kBlockDimY x kBlockDimZ threads. Shared memory is static
// (`buffer`), so no dynamic shared-memory size is passed at launch.

constexpr int kTargetBlocks = 2;  // minBlocksPerSM hint for __launch_bounds__.
constexpr int kBlockDimX = 32;
constexpr int kBlockDimY = 4;
constexpr int kBlockDimZ = 2;
constexpr int kThreadsPerBlock = kBlockDimX * kBlockDimY * kBlockDimZ;
constexpr int kGridBlocks = 4096;

// Author's observation: with 2048 elements no conflicts were reported, with
// 2080 they are. Only the first kBlockDimX elements are ever accessed.
constexpr int kBufferElems = 2080;
static_assert(kBlockDimX <= kBufferElems,
              "buffer must hold one element per threadIdx.x value");

__shared__ float buffer[kBufferElems];

// Runs one write/barrier/read round on the shared buffer.
//
// Threads with threadIdx.y == 0 && threadIdx.z == 0 store their threadIdx.x
// into buffer[threadIdx.x]; after a block-wide barrier, threads with
// threadIdx.z == 0 read that value back. Every thread then stores its result
// (the value read, or 0 for threadIdx.z != 0) to out_array[threadIdx.x].
//
// out_array must have room for at least blockDim.x floats.
//
// NOTE(review): all threads that share a threadIdx.x write the same
// out_array element with different values (threadIdx.x vs 0), so the final
// contents are unspecified. The store appears to exist only to keep the
// shared-memory load alive; it is intentionally left as-is so the measured
// access pattern does not change.
__device__ void OneIteration(float* out_array) {
  // True for the threads with threadIdx.z != 0. With a 32x4x2 block that is the
  // upper half of the block's warps, not every odd-numbered warp.
  const bool skip_read = threadIdx.z != 0;
  float out = 0.0f;

  if (threadIdx.y == 0 && threadIdx.z == 0) {
    buffer[threadIdx.x] = threadIdx.x;
  }
  // Reached by every thread of the block: the barrier is outside any branch.
  __syncthreads();

  // Author's observation: when the flag is forced to the same value for every
  // thread, so that all warps take the same path here, the conflicts no longer
  // seem to be reported.
  if (!skip_read) {
    out = buffer[threadIdx.x];
  }
  out_array[threadIdx.x] = out;
}

// Repeats OneIteration 1000 times so the profiler has enough samples.
// Expects a block of exactly kBlockDimX x kBlockDimY x kBlockDimZ threads
// (at most kThreadsPerBlock, per __launch_bounds__).
__global__ void __launch_bounds__(kThreadsPerBlock, kTargetBlocks)
    Kernel(float* out_array) {
  for (int i = 0; i < 1000; ++i) {
    OneIteration(out_array);
  }
}

// Allocates the output buffer, launches the kernel, waits for it to finish
// and releases the buffer. Returns 0 on success and 1 if any CUDA call fails.
int main(void) {
  float* out_array = nullptr;
  if (cudaMallocManaged(&out_array, sizeof(float) * kBlockDimX) !=
      cudaSuccess) {
    return 1;
  }

  const dim3 block_dim(kBlockDimX, kBlockDimY, kBlockDimZ);
  // Author's observation: ~400k conflicts reported for this configuration.
  Kernel<<<kGridBlocks, block_dim, 0, nullptr>>>(out_array);

  // A launch does not return an error itself: bad configurations show up in
  // cudaGetLastError(), execution faults at the next synchronizing call.
  cudaError_t status = cudaGetLastError();
  if (status == cudaSuccess) {
    // Without this the process could exit while the kernel is still running.
    status = cudaDeviceSynchronize();
  }

  // Always release the allocation, but report the first error encountered.
  const cudaError_t free_status = cudaFree(out_array);
  if (status == cudaSuccess) {
    status = free_status;
  }
  return status == cudaSuccess ? 0 : 1;
}