Hello everyone,

I continue my discovery about reduction !

Now, I want to do something little special !

I have an array of floats of 4 rows and 8 columns, I want to have , after reduction, an array of 1 row & 8 columns which will contain the sum of every column. For example

```
0 1 2 3 4 5 6 7
8 9 10 11 12 13 14 15
16 17 18 19 20 21 22 23
24 25 26 27 28 29 30 31
gives
48 52 56 60 64 68 72 76
```

So I declare 8 blocks that will contain 4 threads each, and I make a normal reduction for every block !

The code I execute on the device is the same than the Reduction #1 example :

```
// Kernel that executes on the CUDA device
__global__ void reduction(float *g_idata, float *g_odata) {
extern __shared__ int sdata[];
// each thread loads one element from global to shared mem
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
sdata[tid] = g_idata[i];
__syncthreads();
// do reduction in shared mem
for (unsigned int s=1; s < blockDim.x; s *= 2) {
if ( tid % (2*s)==0)
{
sdata[tid] += sdata[tid + s];
}
__syncthreads();
}
// write result for this block to global mem
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
```

Doing so the block 0 which will compute the 1st column will write its result in g_odata[0], the block 1 will compute the 2nd column and write in g_odata[1] and so on and so on !

Unfortunately it does not work and after lots of memory schematics I’ve made, I think the problem is not on the device code ! (Please correct me if I’m wrong)

So my main code is :

```
float * d_result;
float * d_result1;
unsigned int W = 8;
unsigned int H = 4;
float h_result[W][H];
float h_result1[W][H];
size_t pitch,pitch1;
int temp=0;
int i=0,j=0;
// Matrix filling
for (i=0;i<W;i++)
{
for(j=0;j<H;j++)
{
h_result[i][j]=temp;
temp++;
}
}
// Memory allocation
cudaMallocPitch((void**) &d_result, &pitch, W*sizeof(float), H);
cudaMallocPitch((void**) &d_result1, &pitch1, W*sizeof(float), H);
// Memory copying
cudaMemcpy2D(d_result,pitch,h_result,W*sizeof(float),W*sizeof(float),H,cudaMemcpyHostToDevice);
dim3 threadPerBlock(H,1);
dim3 dimGrid(W,1);
reduction<<<dimGrid,threadPerBlock, H*sizeof(float)>>>(d_result,d_result1);
cudaMemcpy2D(h_result1,W*sizeof(float),d_result1,pitch1,W*sizeof(float),H,cudaMemcpyDeviceToHost);
for(j=0;j<H;j++) for(i=1;i<W;i++) h_result[0][j]+=h_result[i][j];
cout << "Expected : " << endl;
for(j=0;j<H;j++) cout << h_result[0][j] << " ";
cout << endl << "Obtained : " << endl;
for(i=0;i<H;i++) cout << h_result1[0][i] << " ";
cout << endl;
cudaFree(d_result);
cudaFree(d_result);
return 0;
}
```

Could you see any problem in my configuration or in my way of dealing about memory copying or allocation ?!