// Cooperative-groups solution for SM 70 and newer: labeled_partition groups the
// threads of a warp by a per-thread key so each group can be reduced separately.
// See "Cooperative Groups" in the CUDA C++ Programming Guide.
#include <cstdlib>
#include <iostream>

#include <cooperative_groups.h>
#include <cooperative_groups/reduce.h>
namespace cg = cooperative_groups;
// Warp-level segmented sum (requires SM70+ for labeled_partition).
//
// Each thread reads one value and one group key at position threadIdx.x.
// Lanes of the warp that share the same key are gathered into a sub-group
// with cg::labeled_partition, the sub-group's values are summed with
// cg::reduce, and a single lane per sub-group publishes the total via one
// atomicAdd into output[key] — one atomic per group instead of one per thread.
//
// Assumes a 1-D launch where every thread's element is input[threadIdx.x];
// on architectures below SM70 the kernel compiles to a no-op.
__global__
void kernel(float* output, const int* indices, const float* input){
#if __CUDA_ARCH__ >= 700
    const auto warp = cg::tiled_partition<32>(cg::this_thread_block());
    const float v   = input[threadIdx.x];
    const int  key  = indices[threadIdx.x];
    // Split the warp into sub-groups of lanes holding the same key.
    const auto group = cg::labeled_partition(warp, key);
    const float total = cg::reduce(group, v, cg::plus<float>{});
    // Exactly one lane per sub-group writes the group's sum.
    if (group.thread_rank() == 0) {
        atomicAdd(&output[key], total);
    }
#endif
}
// Abort with file/line + error string when a CUDA runtime call fails.
// Kernel launches report errors asynchronously, so the launch is followed by
// cudaGetLastError() (config errors) and a checked cudaDeviceSynchronize()
// (in-kernel execution errors).
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            std::cerr << "CUDA error at " << __FILE__ << ":" << __LINE__      \
                      << ": " << cudaGetErrorString(err_) << "\n";            \
            std::abort();                                                     \
        }                                                                     \
    } while (0)

// Driver: launches one warp whose 32 threads are split into 4 key groups
// (keys 0..3, 8 lanes each, every value 1.0f), so the expected result is
// output[0..3] == 8 and the remaining entries 0.
int main(){
    constexpr int kThreads = 32;  // one warp; kernel indexes by threadIdx.x only

    // Pinned host memory is device-accessible under unified virtual
    // addressing, so the kernel reads/writes these buffers directly
    // (zero-copy) and no cudaMemcpy is needed.
    float* output = nullptr;
    int* indices = nullptr;
    float* input = nullptr;
    CUDA_CHECK(cudaMallocHost(&output, sizeof(float) * kThreads));
    CUDA_CHECK(cudaMallocHost(&indices, sizeof(int) * kThreads));
    CUDA_CHECK(cudaMallocHost(&input, sizeof(float) * kThreads));

    for(int i = 0; i < kThreads; i++){
        indices[i] = i % 4;  // four groups of eight lanes each
        input[i] = 1.0f;
        output[i] = 0.0f;
    }

    kernel<<<1, kThreads>>>(output, indices, input);
    CUDA_CHECK(cudaGetLastError());       // launch-configuration errors
    CUDA_CHECK(cudaDeviceSynchronize());  // wait + surface async kernel errors

    for(int i = 0; i < kThreads; i++){
        std::cout << output[i] << " ";
    }
    std::cout << "\n";

    // Release the pinned allocations (the original leaked them).
    CUDA_CHECK(cudaFreeHost(input));
    CUDA_CHECK(cudaFreeHost(indices));
    CUDA_CHECK(cudaFreeHost(output));
    return 0;
}