Hi all,
I tried to write a little sum function and followed the SDK reduction sample.
The problem is, I always get inf as result.
The kernel function is not optimized, because at first it should work ^^. Here it is…
__global__ void gpu_sum_kernel( float *x_d, float *res_d, int N )
{
unsigned int gid = blockIdx.x * blockDim.x + threadIdx.x;
if( gid < N )
{
extern __shared__ float temp_s[];
temp_s[ threadIdx.x ] = x_d[ gid ];
for( int i = blockDim.x / 2; i > 0; i >>= 1 )
{
temp_s[ threadIdx.x ] += temp_s[ threadIdx.x + i ];
__syncthreads();
}
if( threadIdx.x == 0 )
res_d[ blockIdx.x ] = temp_s[ 0 ];
}
}
It try to calculate the sum by calling the kernel again an again with the results from old kernel calls.
float gpu_sum( float *x, int N )
{
int block_size = 128;
int grid_size = (int)ceil( N / (float)block_size );
float *x_d;
int size = sizeof( float ) * N;
cudaMalloc( (void**)&x_d, size );
cudaMemcpy( x_d, x, size, cudaMemcpyHostToDevice );
float res = 0;
float *res_d;
cudaMalloc( (void**)&res_d, grid_size * sizeof( float ) );
checkCudaError( "[cudaMemcpy( H->D )]" );
while( N > 1 )
{
dim3 dimBlock( block_size );
dim3 dimGrid( grid_size );
gpu_sum_kernel<<< dimGrid, dimBlock, sizeof( float ) * block_size >>>( x_d, res_d, N );
checkCudaError( "[kernel]" );
cudaThreadSynchronize();
float *tmp = res_d;
res_d = x_d;
x_d = tmp;
N = grid_size;
grid_size = (int)ceil( N / (float)block_size );
if( grid_size * block_size > N )
{
grid_size = 1;
block_size = N;
}
}
cudaMemcpy( &res, res_d, sizeof( float ), cudaMemcpyDeviceToHost );
checkCudaError( "[cudaMemcpy( D->H )]" );
cudaFree( x_d );
cudaFree( res_d );
return res;
}
I used a similiar function to calculate mean values and noticed more and more increasing results with independent programm calls. Do I have some mistakes in my memory management?
Thanks for looking