Hi there, the following code brings my Dell Precision 7500 box (with a Tesla C2050) to a complete halt. The only thing left I can do is press the power button to turn off the system. I cannot see what is wrong with this code, apart from the fact that it is the worst possible implementation of a simple convolution algorithm: it runs only one thread in total. Here is the code:
[codebox]
#include <cuda.h>

#include <algorithm>
#include <cassert>
#include <cstdio>
#include <cstdlib>
#include <cstring>
// Number of samples in each of the two input signals.
const unsigned int SRC_N = 10000;
// Number of samples in the output: the largest index written by the
// convolution loops is ( SRC_N - 1 ) + ( SRC_N - 1 ), i.e. 2 * SRC_N - 2.
const unsigned int DST_N = 2 * SRC_N - 1;
// Accumulates the full linear convolution of two SRC_N-sample signals into dst:
//
//     dst[ k ] += sum of src_1[ i ] * src_2[ j ] over all i + j == k
//
// Parameters:
//   src_1, src_2  device pointers to SRC_N input samples each (read only)
//   dst           device pointer to DST_N output samples; the kernel adds to
//                 the existing contents, so the caller must initialise it
//                 (e.g. to zero) before the launch
//
// Launch layout: 1-D grid of 1-D blocks, any size. Each thread owns a disjoint
// set of output indices (grid-stride loop), so no two threads ever write the
// same element and a <<< 1, 1 >>> launch produces the same result as a larger
// one. No shared memory is used.
__global__ void convolve_gpu_simple( float* src_1
                                   , float* src_2
                                   , float* dst
                                   )
{
    const unsigned int stride = gridDim.x * blockDim.x;

    for( unsigned int k = blockIdx.x * blockDim.x + threadIdx.x; k < DST_N; k += stride )
    {
        // Range of i for which both i and j = k - i lie inside [0, SRC_N).
        const unsigned int i_begin = ( k < SRC_N ) ? 0u : k - ( SRC_N - 1u );
        const unsigned int i_end   = ( k < SRC_N ) ? k + 1u : SRC_N;   // exclusive

        // Start from the current value and add the products in ascending i,
        // which is the same order a sequential double loop over (i, j) uses.
        float acc = dst[ k ];
        for( unsigned int i = i_begin; i < i_end; ++i )
        {
            acc += src_1[ i ] * src_2[ k - i ];
        }
        dst[ k ] = acc;
    }
}
// Terminates the program if a CUDA runtime call did not succeed.
//
// Parameters:
//   err  status code returned by a CUDA runtime API call
//
// On any value other than cudaSuccess the textual description of the error is
// written to stderr and the process exits with status 1. On success the
// function returns without side effects.
void check_for_error( cudaError_t err )
{
    if( err != cudaSuccess )
    {
        // Print the reason before exiting; otherwise the failure is invisible.
        std::fprintf( stderr, "CUDA error: %s\n", cudaGetErrorString( err ));
        std::exit( 1 );
    }
}
// Host-side reference convolution.
//
// Adds every product a[ row ] * b[ col ] to c[ row + col ] for all
// row, col in [0, SRC_N). The output is accumulated into, not overwritten,
// so c must already hold the desired starting values.
//
// Parameters:
//   a, b  input arrays, SRC_N elements each
//   c     output array; indices 0 .. 2 * SRC_N - 2 are updated
void convolve_cpu( float* a, float* b, float* c )
{
    for( int row = 0; row < SRC_N; ++row )
    {
        // shifted[ col ] is the same element as c[ row + col ]
        float* shifted = c + row;
        for( int col = 0; col < SRC_N; ++col )
        {
            shifted[ col ] += a[ row ] * b[ col ];
        }
    }
}
// Maps the bit pattern of a float to an integer that grows monotonically with
// the float's value, so that the difference of two mapped values is their
// distance in units in the last place (ULPs).
// Assumes float and unsigned int are both 32 bits wide and that floats use
// the IEEE-754 sign-magnitude layout.
static long long ordered_float_bits( float value )
{
    unsigned int bits = 0;
    // memcpy instead of a pointer cast: reading a float through an
    // unsigned int* violates the aliasing rules.
    std::memcpy( &bits, &value, sizeof( bits ));

    const long long magnitude = bits & 0x7FFFFFFFu;
    // Negative values are mirrored below zero; +0 and -0 both map to 0.
    return ( bits & 0x80000000u ) ? -magnitude : magnitude;
}

// Checks that two DST_N-element arrays agree element-wise to within 5 ULPs.
//
// Parameters:
//   a, b  arrays of DST_N floats to compare
//
// Returns true when every pair a[ i ], b[ i ] is at most 5 ULPs apart and
// false as soon as one pair differs by more.
bool compare_results( float* a, float* b )
{
    const long long max_ulps = 5;

    for( unsigned int i = 0; i < DST_N; ++i )
    {
        // 64-bit arithmetic: the difference of two 32-bit patterns cannot overflow.
        const long long delta = ordered_float_bits( a[ i ] ) - ordered_float_bits( b[ i ] );
        const long long ulps  = ( delta < 0 ) ? -delta : delta;
        if( ulps > max_ulps )
            return false;
    }
    return true;
}
using namespace std;
// Runs the convolution of two all-ones signals on the CPU and on the GPU and
// compares the two results.
//
// Returns 0 when the GPU result matches the CPU reference (see
// compare_results) and 1 when it does not. Any failing CUDA call terminates
// the program through check_for_error.
int main(int argc, char** argv)
{
    srand( 0 );

    float *src_1, *src_2, *dst_cpu, *dst_gpu;
    float *dev_src_1, *dev_src_2, *dev_dst;

    src_1   = new float[ SRC_N ];
    src_2   = new float[ SRC_N ];
    dst_cpu = new float[ DST_N ];
    dst_gpu = new float[ DST_N ];

    check_for_error( cudaMalloc( &dev_src_1, SRC_N * sizeof( float )));
    check_for_error( cudaMalloc( &dev_src_2, SRC_N * sizeof( float )));
    check_for_error( cudaMalloc( &dev_dst  , DST_N * sizeof( float )));

    fill( src_1, src_1 + SRC_N, 1.0f );
    fill( src_2, src_2 + SRC_N, 1.0f );
    fill( dst_cpu, dst_cpu + DST_N, 0.0f );
    fill( dst_gpu, dst_gpu + DST_N, 0.0f );

    convolve_cpu( src_1, src_2, dst_cpu );

    check_for_error( cudaMemcpy( dev_src_1, src_1, SRC_N * sizeof( float ), cudaMemcpyHostToDevice ));
    check_for_error( cudaMemcpy( dev_src_2, src_2, SRC_N * sizeof( float ), cudaMemcpyHostToDevice ));

    // The kernel accumulates with +=, so the device output buffer has to start
    // at zero; cudaMalloc does not clear memory. An all-zero byte pattern is
    // 0.0f, so a byte-wise cudaMemset is sufficient here.
    check_for_error( cudaMemset( dev_dst, 0, DST_N * sizeof( float )));

    convolve_gpu_simple<<< 1, 1 >>>( dev_src_1, dev_src_2, dev_dst );
    // Catches launch-time errors such as an invalid configuration.
    check_for_error( cudaGetLastError() );
    // cudaMemcpy blocks until the kernel has finished; errors raised while the
    // kernel was running are reported through its return value.
    check_for_error( cudaMemcpy( dst_gpu, dev_dst, DST_N * sizeof( float ), cudaMemcpyDeviceToHost ));

    // Explicit check instead of assert(): the comparison must also run in
    // builds where NDEBUG removes assertions.
    const bool results_match = compare_results( dst_cpu, dst_gpu );
    if( !results_match )
    {
        std::fprintf( stderr, "GPU result differs from CPU reference\n" );
    }

    check_for_error( cudaFree( dev_src_1 ));
    check_for_error( cudaFree( dev_src_2 ));
    check_for_error( cudaFree( dev_dst ));

    // All four host buffers come from new[], so all four need delete[].
    delete[] src_1;
    delete[] src_2;
    delete[] dst_cpu;
    delete[] dst_gpu;

    return results_match ? 0 : 1;
}[/codebox]
Any help is very much appreciated. I’m using VS2008 on a Windows 7 x64 box. The NVIDIA driver is 258.96 and I use CUDA 3.1.
Thanks,
Christian