matrix column sums

Hello, I implemented a kernel to get the sum of each row of a matrix. The kernel works for little matrices … but when i use some matrix of 3000 x 3000 elements the reults seem to be nan.

The kernel configuration rules to do every row of blocks to sum up one row. This code works fine with a 100x100 matrix.

The Host Code is

// d_dim is the matrix dimension

// maxThreads is the maximum thread number per block ( usually 512 )

		for( int i = 0; i < something; i++ )

		{

// ... manipulation of the matrix

			sum_DimBlock.x = ( d_dim < maxThreads << 1 ) ? (int)ceil( d_dim / 2.0 ) : maxThreads;

			sum_DimGrid.x = (int)ceil( d_dim / (float)( sum_DimBlock.x << 1 ));

			sum_DimGrid.y = d_dim;

			sums<<< sum_DimGrid, sum_DimBlock, sizeof( float ) * sum_DimBlock.x>>>( i, d_dim );

			while( sum_DimGrid.x > 1 )

			{

				n = sum_DimGrid.x;

				sum_DimBlock.x = ( n < maxThreads << 1 ) ? (int)ceil( n / 2.0 ) : maxThreads;

				sum_DimGrid.x = (int)ceil( n / (float)( sum_DimBlock.x << 1 ));

				s<<< sum_DimGrid, sum_DimBlock, sizeof( float ) * sum_DimBlock.x >>>( i, n );

			}

		}

...

The device code looks like ( results and T_d_cPtr are in constant memory and pointers to linear device memory ):

__global__ void sums( int plane, int n )

{

	extern __shared__ float T_s[];

	unsigned int ix = blockIdx.x * blockDim.x + threadIdx.x;

 	unsigned int i = blockIdx.y * d_dim_c + ix;

	unsigned int b = gridDim.x * blockDim.x;

	

	T_s[ threadIdx.x ] = T_d_cPtr[ i ] + ( ix + b < n ? T_d_cPtr[ i + b ] : 0 );

	__syncthreads();

	unsigned int s_old = blockDim.x;

	for( int s = blockDim.x >> 1; s > 0; s >>= 1 )

	{

		if( threadIdx.x < s )

		{

			T_s[ threadIdx.x ] += T_s[ threadIdx.x + s ];

			if( threadIdx.x == 0 && s_old != s << 1 ) // if dim is not even, take the last element

			{

				T_s[ 0 ]  += T_s[ s_old - 1 ];

			}

		}

		s_old = s;

		__syncthreads();

	}

	if( threadIdx.x == 0 )

	{

		if( gridDim.x == 1 )

		{

			results[ blockIdx.y * target_dim_c + plane ] = T_s[0];

		}

		else

		{

			T_d_cPtr[ blockIdx.y * d_dim_c + blockIdx.x ] = T_s[0];

		}

	}

}

Does anybody see a mistake?