sum columns of a 2 dimensional array with Reduce algorithm

I need to sum columns of a matrix (2 dimensional array) with reduce algorithm on CUDA. I searched and found https://stackoverflow.com/questions/21428378/reduce-matrix-columns-with-cuda, but results are not as I expected.

I have:
row one: | 0| 1| 2| 3| 4|
row two: | 5| 6| 7| 8| 9|
row three: |10|11|12|13|14|
row four: |15|16|17|18|19|

I need to obtain:
row one: |30|34|38|42|46|

my CUDA code:

#include "device_launch_parameters.h"
#include "device_functions.h"
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>
#include <time.h>
#include "Header.cuh"

__global__ void shared_reduce_kernel(float * d_out, const float * d_in)
{
  extern __shared__ float sdata[];
  int tid = threadIdx.x+blockDim.x*blockIdx.x;
  if(tid<UEs)
  {
    sdata[tid] = d_in[tid];
  }
  __syncthreads();
  
  for( int offset = blockDim.x/2; offset > 0; offset >>= 1 )
  {
    if(threadIdx.x < offset)
    {
        sdata[threadIdx.x] += sdata[threadIdx.x + offset];
    }
    __syncthreads();
  }
  if(threadIdx.x == 0)
  {
    d_out[blockIdx.x] = sdata[0];
  }
}

int main()
{
  float array[UEs][FFT_size];
  for (int a = 0; a<UEs; a++)
  {
    for (int b = 0; b<FFT_size; b++)
    {
      array[a][b] = b+a*FFT_size;
      printf("array[%d][%d] = %.2f; ", a, b, array[a][b]);
    }
    printf("\n");
  }
  printf("\n");

  float *d_out;
  cudaMalloc((void**)&d_out, FFT_size*sizeof(float));
  float *d_in;
  cudaMalloc((void**)&d_in,  FFT_size*UEs*sizeof(float));
  cudaMemcpy(d_in, array, FFT_size*UEs*sizeof(float), cudaMemcpyHostToDevice);

  shared_reduce_kernel << < UEs, FFT_size, FFT_size*UEs*sizeof(float) >> >(d_out, d_in);

  float *out;
  out = (float *)malloc(FFT_size*sizeof(float));
  cudaMemcpy(out, d_out, sizeof(float), cudaMemcpyDeviceToHost);
  //printf("\n");

  for(int c = 0; c < FFT_size; c++)
  {
    printf("sum of all elements in a column %d: %.2f\n", c, out[c]);
  }

  cudaFree(d_in);
  cudaFree(d_out);

  return 0;
}

Header file:

#define UEs 3
#define FFT_size 4

Thank you