Hello, I implemented a kernel to compute the sum of each row of a matrix. The kernel works for small matrices, but when I use a matrix of 3000 x 3000 elements the results come out as NaN.

The kernel is configured so that each row of blocks (one `blockIdx.y` value) sums up one row of the matrix. This code works fine with a 100x100 matrix.

The Host Code is

```
// d_dim is the matrix dimension (rows == columns == d_dim)
// maxThreads is the maximum thread number per block ( usually 512 )
//
// Launch configuration: each thread performs a first add while loading,
// so a block of B threads covers 2*B elements. Hence blockDim.x is
// ceil(d_dim / 2) when one block suffices, else maxThreads, and
// gridDim.x is the number of 2*blockDim.x-wide tiles per row.
// gridDim.y spawns one row of blocks per matrix row.
//
// NOTE(review): no cudaGetLastError()/cudaDeviceSynchronize() after the
// launches — launch-configuration and execution errors go unnoticed here.
// NOTE(review): for d_dim = 3000 and maxThreads = 512 this yields
// gridDim.x = 3, i.e. the multi-block path that fails; with 100x100 it is
// gridDim.x = 1, which is why the small case works — see the kernel below.
for( int i = 0; i < something; i++ )
{
// ... manipulation of the matrix
sum_DimBlock.x = ( d_dim < maxThreads << 1 ) ? (int)ceil( d_dim / 2.0 ) : maxThreads;
sum_DimGrid.x = (int)ceil( d_dim / (float)( sum_DimBlock.x << 1 ));
sum_DimGrid.y = d_dim;
// First pass: per-block partial sums of each row; dynamic shared memory
// holds blockDim.x floats.
sums<<< sum_DimGrid, sum_DimBlock, sizeof( float ) * sum_DimBlock.x>>>( i, d_dim );
// Follow-up passes: keep reducing the gridDim.x partial sums per row
// until a single block (and thus a single value) per row remains.
while( sum_DimGrid.x > 1 )
{
n = sum_DimGrid.x;
sum_DimBlock.x = ( n < maxThreads << 1 ) ? (int)ceil( n / 2.0 ) : maxThreads;
sum_DimGrid.x = (int)ceil( n / (float)( sum_DimBlock.x << 1 ));
s<<< sum_DimGrid, sum_DimBlock, sizeof( float ) * sum_DimBlock.x >>>( i, n );
}
}
...
```

The device code looks like this (`results` and `T_d_cPtr` are pointers, stored in constant memory, to linear device memory):

```
// Partial row-sum reduction kernel.
//
// Expected launch layout (set by the host code): gridDim.y == number of
// matrix rows; the blocks sharing one blockIdx.y cooperate on one row of
// length n. Dynamic shared memory: blockDim.x floats.
//
// Each thread loads two elements (its own and one a grid-width `b` away,
// performing the first add during the load), then the block runs a
// shared-memory tree reduction with a fix-up for odd widths. When
// gridDim.x == 1 the block sum is the row sum and is stored to `results`;
// otherwise each block leaves its partial sum in element blockIdx.x of the
// row for the follow-up kernel `s`.
__global__ void sums( int plane, int n )
{
    extern __shared__ float T_s[];                            // blockDim.x partial sums

    unsigned int ix = blockIdx.x * blockDim.x + threadIdx.x;  // column of first load
    unsigned int i  = blockIdx.y * d_dim_c + ix;              // flat index into this row
    unsigned int b  = gridDim.x * blockDim.x;                 // offset of the second load

    // Bounds-check BOTH loads. The original guarded only ix + b < n, so the
    // first load silently relied on the host always launching fewer than n
    // threads per row; any configuration with gridDim.x * blockDim.x >= n
    // read past the end of the row and fed garbage (possibly NaN) into the
    // reduction. Use 0.0f (float literal) to avoid a double promotion.
    T_s[ threadIdx.x ] = ( ix < n     ? T_d_cPtr[ i ]     : 0.0f )
                       + ( ix + b < n ? T_d_cPtr[ i + b ] : 0.0f );
    __syncthreads();

    // Tree reduction in shared memory. s_old tracks the previous active
    // width; when that width was odd, its last element has no partner, so
    // thread 0 folds it into T_s[0].
    unsigned int s_old = blockDim.x;
    for( int s = blockDim.x >> 1; s > 0; s >>= 1 )
    {
        if( threadIdx.x < s )
        {
            T_s[ threadIdx.x ] += T_s[ threadIdx.x + s ];
            if( threadIdx.x == 0 && s_old != s << 1 ) // previous width was odd
            {
                T_s[ 0 ] += T_s[ s_old - 1 ];
            }
        }
        s_old = s;
        __syncthreads();
    }

    if( threadIdx.x == 0 )
    {
        if( gridDim.x == 1 )
        {
            // Single block per row: the block sum is the complete row sum.
            results[ blockIdx.y * target_dim_c + plane ] = T_s[0];
        }
        else
        {
            // NOTE(review): this in-place write races with the other blocks
            // of the same row. Block k stores its partial sum into element k
            // of the row, but elements 0..gridDim.x-1 are also *inputs* that
            // block 0 reads at the load above — and CUDA gives no execution
            // ordering between blocks. With gridDim.x > 1 (e.g. 3000x3000 ->
            // gridDim.x == 3) a fast block can overwrite data a slower block
            // has not loaded yet. This is the likely cause of the wrong
            // results on large matrices (100x100 gives gridDim.x == 1 and so
            // never takes this branch). The partial sums must be written to a
            // separate buffer (ping-pong between passes) instead of back
            // into T_d_cPtr — that fix requires an extra device allocation
            // on the host side, outside this kernel.
            T_d_cPtr[ blockIdx.y * d_dim_c + blockIdx.x ] = T_s[0];
        }
    }
}
```

Does anybody see a mistake?