// Cooperative-groups solution for SM 70 and newer: labeled_partition groups the
// threads of a warp by a per-thread key so each group can be reduced separately.
// See "Cooperative Groups" in the CUDA C++ Programming Guide.
#include <cstdlib>
#include <iostream>

#include <cooperative_groups.h>
#include <cooperative_groups/reduce.h>
namespace cg = cooperative_groups;
// Warp-level segmented sum (requires SM70+ for labeled_partition).
//
// Each thread reads one value and one group key at position threadIdx.x.
// Lanes of the warp that share the same key are gathered into a sub-group
// with cg::labeled_partition, the sub-group's values are summed with
// cg::reduce, and a single lane per sub-group publishes the total via one
// atomicAdd into output[key] — one atomic per group instead of one per thread.
//
// Assumes a 1-D launch where every thread's element is input[threadIdx.x];
// on architectures below SM70 the kernel compiles to a no-op.
__global__
void kernel(float* output, const int* indices, const float* input){
#if __CUDA_ARCH__ >= 700
    const auto warp = cg::tiled_partition<32>(cg::this_thread_block());
    const float v   = input[threadIdx.x];
    const int  key  = indices[threadIdx.x];
    // Split the warp into sub-groups of lanes holding the same key.
    const auto group = cg::labeled_partition(warp, key);
    const float total = cg::reduce(group, v, cg::plus<float>{});
    // Exactly one lane per sub-group writes the group's sum.
    if (group.thread_rank() == 0) {
        atomicAdd(&output[key], total);
    }
#endif
}
// Abort with file/line + error string when a CUDA runtime call fails.
// Kernel launches report errors asynchronously, so the launch is followed by
// cudaGetLastError() (config errors) and a checked cudaDeviceSynchronize()
// (in-kernel execution errors).
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            std::cerr << "CUDA error at " << __FILE__ << ":" << __LINE__      \
                      << ": " << cudaGetErrorString(err_) << "\n";            \
            std::abort();                                                     \
        }                                                                     \
    } while (0)

// Driver: launches one warp whose 32 threads are split into 4 key groups
// (keys 0..3, 8 lanes each, every value 1.0f), so the expected result is
// output[0..3] == 8 and the remaining entries 0.
int main(){
    constexpr int kThreads = 32;  // one warp; kernel indexes by threadIdx.x only

    // Pinned host memory is device-accessible under unified virtual
    // addressing, so the kernel reads/writes these buffers directly
    // (zero-copy) and no cudaMemcpy is needed.
    float* output = nullptr;
    int* indices = nullptr;
    float* input = nullptr;
    CUDA_CHECK(cudaMallocHost(&output, sizeof(float) * kThreads));
    CUDA_CHECK(cudaMallocHost(&indices, sizeof(int) * kThreads));
    CUDA_CHECK(cudaMallocHost(&input, sizeof(float) * kThreads));

    for(int i = 0; i < kThreads; i++){
        indices[i] = i % 4;  // four groups of eight lanes each
        input[i] = 1.0f;
        output[i] = 0.0f;
    }

    kernel<<<1, kThreads>>>(output, indices, input);
    CUDA_CHECK(cudaGetLastError());       // launch-configuration errors
    CUDA_CHECK(cudaDeviceSynchronize());  // wait + surface async kernel errors

    for(int i = 0; i < kThreads; i++){
        std::cout << output[i] << " ";
    }
    std::cout << "\n";

    // Release the pinned allocations (the original leaked them).
    CUDA_CHECK(cudaFreeHost(input));
    CUDA_CHECK(cudaFreeHost(indices));
    CUDA_CHECK(cudaFreeHost(output));
    return 0;
}