// Second argument to __launch_bounds__ on Kernel: the minimum number of
// resident blocks per multiprocessor the compiler should aim for.
constexpr int kTargetBlocks = 2;

// Extent of the thread block along x; used in Kernel's launch bounds and in
// the block shape that main launches.
constexpr int kBlockDimX = 32;
// Statically sized shared scratch buffer. Only elements [0, blockDim.x) are
// ever indexed (by threadIdx.x) in this file.
// Author's observation: with a size of 2048 no conflicts are reported,
// interesting.
__shared__ float buffer[2080];

// Runs one pass of the experiment: stage one value per x index in the shared
// `buffer`, synchronize the block, then let only the threads with
// threadIdx.z == 0 read the value back before everyone stores to out_array.
//
// out_array: destination indexed by threadIdx.x only, so it must hold at
//            least blockDim.x floats.
//
// NOTE(review): every thread of every block stores to out_array[threadIdx.x].
// Threads with threadIdx.z != 0 store 0 while the others store the value read
// from `buffer`, so the final contents depend on which store lands last.
// Presumably the store only exists to keep the shared-memory load from being
// optimized away -- confirm before relying on the output values.
//
// NOTE(review): Kernel calls this in a loop and there is no barrier after the
// read below, so one warp's write for the next pass can overlap another warp's
// read from this pass. The stored values are identical on every pass, but a
// race checker would likely still report it.
__device__ void OneIteration(float* out_array) {
  // Any non-zero threadIdx.z converts to true. With the {32, 4, 2} block that
  // main launches (and CUDA's x-fastest thread ordering) each warp is one
  // (y, z) row, so this selects the warps with z == 1 -- the upper half of the
  // block -- rather than alternating warps as the name suggests.
  bool odd_warp = threadIdx.z;

  float out = 0;
  // Only the y == 0, z == 0 threads populate the buffer: one store per x index.
  if (threadIdx.y == 0 && threadIdx.z == 0) {
    buffer[threadIdx.x] = threadIdx.x;
  }
  // Block-wide barrier so the stores above are visible before any read below.
  __syncthreads();

  // If I set odd_warp true so everyone goes down the path, then I don't seem
  // to get conflicts.
  if (!odd_warp) {
    // Each reading thread loads the element matching its own x index.
    out = buffer[threadIdx.x];
  }

  // Unconditional store by all threads; see the NOTE(review) above.
  out_array[threadIdx.x] = out;
}

// Benchmark entry point: repeats OneIteration a fixed number of times so the
// shared-memory traffic it generates dominates what a profiler sees.
//
// Launch bounds allow at most 2 * 4 * kBlockDimX threads per block and ask for
// at least kTargetBlocks resident blocks per multiprocessor.
//
// out_array: forwarded unchanged to OneIteration on every pass.
__global__ void __launch_bounds__(2 * 4 * kBlockDimX, kTargetBlocks)
    Kernel(float* out_array) {
  constexpr int kPasses = 1000;
  int remaining = kPasses;
  while (remaining-- > 0) {
    OneIteration(out_array);
  }
}

// Host driver: allocates the output buffer, launches Kernel once, and waits
// for it to finish.
//
// Returns 0 when allocation, launch, execution and cleanup all succeed, and 1
// otherwise.
int main(void) {
  // Kernel indexes out_array by threadIdx.x, which is bounded by kBlockDimX.
  float* out_array = nullptr;
  if (cudaMallocManaged(&out_array, sizeof(float) * kBlockDimX) !=
      cudaSuccess) {
    return 1;
  }

  // 32 x 4 x 2 = 256 threads, matching Kernel's __launch_bounds__.
  const dim3 block_dim(kBlockDimX, 4, 2);
  // 400k conflicts reported
  Kernel<<<4096, block_dim, 0, nullptr>>>(out_array);

  // A launch does not return an error itself: configuration problems show up
  // through cudaGetLastError(), and faults during execution surface at the
  // next synchronizing call.
  cudaError_t status = cudaGetLastError();
  if (status == cudaSuccess) {
    // Without this the process could exit before the kernel has finished.
    status = cudaDeviceSynchronize();
  }

  // Release the managed allocation on both the success and the failure path.
  const cudaError_t free_status = cudaFree(out_array);

  return (status == cudaSuccess && free_status == cudaSuccess) ? 0 : 1;
}