Hi,

I am trying to take the sum over the last 2 dimension of a 3 dimension array A, that is the sum over the irot and ik dimension. I am not sure that I doing this right. Help will be apreciated.

I have attached the code for the kernel below.

Thanks

```
/* summing the 3D matrix in the nrot and nk dimension */
extern "C" __global__ void
psum_23D( float *A, float *partial, float *temp, int npart, int nrot, int nk) {
int ipart = blockIdx.x * blockDim.x + threadIdx.x;
int irot = blockIdx.y * blockDim.y + threadIdx.y;
int ik = blockIdx.z * blockDim.z + threadIdx.z;
int sharedIndex_x = threadIdx.x;
int sharedIndex_y = threadIdx.y;
int sharedIndex_z = threadIdx.z;
//allocating the shared memory array
__shared__ float shared_A[16][16][4];
__shared__ float temp_A[16];
temp[ipart] = 0.0;
while ( ipart < npart ){
while ( irot < nrot ){
while ( ik < nk ){
temp_A[sharedIndex_x] += A[(irot+nrot*ik)*npart+ipart];
ik += blockDim.z * gridDim.z;
}
irot += blockDim.y * gridDim.y;
}
ipart += blockDim.x * gridDim.x;
}
shared_A[sharedIndex_x][sharedIndex_y][sharedIndex_z] = temp_A[sharedIndex_x];
//synchronizing the threads
__syncthreads();
//doing the 1/2 reduction sum at each step
int j = blockDim.y / 2.0;
while ( j != 0 ) {
if ( sharedIndex_y < j ) {
int k = blockDim.z / 2.0;
while ( k != 0 ) {
if ( sharedIndex_z < k ) shared_A[sharedIndex_x][sharedIndex_y][sharedIndex_z] +=
shared_A[sharedIndex_x][sharedIndex_y + j][sharedIndex_z + k];
__syncthreads();
k /= 2;
}
}
__syncthreads();
j /= 2;
}
if ( sharedIndex_y == 0 ) {
if ( sharedIndex_z == 0 ) {
partial[ipart * npart + blockIdx.x] = shared_A[sharedIndex_x][0][0];
}
}
}
```