# Reducing an array to 10 elements by summing the elements whose indices have the same remainder modulo 10

I have a float array with 1 million elements,
and I need to sum together the elements whose indices have the same remainder when divided by 10. I found a reduction function, but I need 10 outputs instead of 1. To explain better, I need to parallelize the function below (it is not completely correct; I wrote it only to illustrate the idea):

``````for (int i = 0; i < size; i++) {
    // i % 10 selects which of the ten running sums this element belongs to,
    // so output must hold ten values (output[0] .. output[9]), all starting at zero.
    output[i % 10] += input[i];
}
``````
``````#include <cuda_runtime.h>
#include <device_launch_parameters.h>

/**
 * Adds every input[i] into output[i % 10].
 *
 * @param output  Array of at least 10 floats. The kernel accumulates into it
 *                (it does not clear it), so the caller must set the ten
 *                entries to their starting value (normally 0.0f) before launch.
 * @param input   Array of `size` floats to be summed.
 * @param size    Number of elements in `input`.
 *
 * Works with any 1-D grid and block size: each thread starts at its global
 * index and advances by the total number of launched threads. Uses 10 floats
 * of static shared memory per block.
 *
 * The additions are performed with floating-point atomics, so their order is
 * unspecified and the low-order bits of the result may differ between runs.
 */
__global__ void reduce_mod10(float* output, const float* input, unsigned int size) {
    const unsigned int kBins = 10;

    // Per-block partial sums: contention stays in shared memory and each block
    // issues only one global atomic per bin.
    __shared__ float blockSums[kBins];
    for (unsigned int b = threadIdx.x; b < kBins; b += blockDim.x) {
        blockSums[b] = 0.0f;
    }
    __syncthreads();

    // 64-bit indices so that neither the start index nor `i += stride` can wrap
    // around when size or the grid is close to the 32-bit limit.
    const size_t stride = static_cast<size_t>(gridDim.x) * blockDim.x;
    for (size_t i = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
         i < size;
         i += stride) {
        atomicAdd(&blockSums[i % kBins], input[i]);
    }
    __syncthreads();

    // Publish this block's partial sums to the global result.
    for (unsigned int b = threadIdx.x; b < kBins; b += blockDim.x) {
        atomicAdd(&output[b], blockSums[b]);
    }
}

#include <iostream>
#include <numeric>

int main() {
using namespace std;

unsigned int N = 1000 * 1000;
float* input;
cudaMallocManaged(&input, N*sizeof(float));
iota(input, input+N, 0.0f); // 0.0, 1.0, 2.0, ...

float* output;
cudaMallocManaged(&output, 10*sizeof(float));
fill_n(output, 10, 0.0f); // fill 0.0

reduce_mod10<<<(N+255)/256,256>>>(output,input,N);