I need to sum columns of a matrix (2 dimensional array) with reduce algorithm on CUDA. I searched and found https://stackoverflow.com/questions/21428378/reduce-matrix-columns-with-cuda, but results are not as I expected.
I have:
row one: | 0| 1| 2| 3| 4|
row two: | 5| 6| 7| 8| 9|
row three: |10|11|12|13|14|
row four: |15|16|17|18|19|
I need to obtain:
row one: |30|34|38|42|46|
my CUDA code:
#include "device_launch_parameters.h"
#include "device_functions.h"
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>
#include <time.h>
#include "Header.cuh"
__global__ void shared_reduce_kernel(float * d_out, const float * d_in)
{
extern __shared__ float sdata[];
int tid = threadIdx.x+blockDim.x*blockIdx.x;
if(tid<UEs)
{
sdata[tid] = d_in[tid];
}
__syncthreads();
for( int offset = blockDim.x/2; offset > 0; offset >>= 1 )
{
if(threadIdx.x < offset)
{
sdata[threadIdx.x] += sdata[threadIdx.x + offset];
}
__syncthreads();
}
if(threadIdx.x == 0)
{
d_out[blockIdx.x] = sdata[0];
}
}
int main()
{
float array[UEs][FFT_size];
for (int a = 0; a<UEs; a++)
{
for (int b = 0; b<FFT_size; b++)
{
array[a][b] = b+a*FFT_size;
printf("array[%d][%d] = %.2f; ", a, b, array[a][b]);
}
printf("\n");
}
printf("\n");
float *d_out;
cudaMalloc((void**)&d_out, FFT_size*sizeof(float));
float *d_in;
cudaMalloc((void**)&d_in, FFT_size*UEs*sizeof(float));
cudaMemcpy(d_in, array, FFT_size*UEs*sizeof(float), cudaMemcpyHostToDevice);
shared_reduce_kernel << < UEs, FFT_size, FFT_size*UEs*sizeof(float) >> >(d_out, d_in);
float *out;
out = (float *)malloc(FFT_size*sizeof(float));
cudaMemcpy(out, d_out, sizeof(float), cudaMemcpyDeviceToHost);
//printf("\n");
for(int c = 0; c < FFT_size; c++)
{
printf("sum of all elements in a column %d: %.2f\n", c, out[c]);
}
cudaFree(d_in);
cudaFree(d_out);
return 0;
}
Header file:
#define UEs 3
#define FFT_size 4
Thank you