System just halts for simple CUDA program

Hi there, the following code brings my Dell Precision 7500 box (with a Tesla C2050) to a complete halt. The only thing left I can do is press the power button to turn off the system. I cannot see what is wrong with this code, apart from the fact that it is the worst possible implementation of a simple convolution algorithm: it runs only one thread in total. Here is the code:

[codebox]

#include <algorithm>
#include <cassert>
#include <cstdio>
#include <cstdlib>
#include <cstring>

#include <cuda.h>

// Number of samples in each of the two input signals.
const unsigned int SRC_N = 10000u;

// Number of samples in the full linear convolution of two SRC_N-sample
// signals (the highest output index is (SRC_N - 1) + (SRC_N - 1)).
const unsigned int DST_N = SRC_N + SRC_N - 1u;

// Accumulates the full linear convolution of src_1 and src_2 into dst.
//
// Parameters (all device pointers):
//   src_1, src_2 : input signals, SRC_N floats each (read only).
//   dst          : output, DST_N floats. The convolution is ADDED to the
//                  existing contents, so the caller must zero dst first.
//
// Launch layout: any 1D grid of 1D blocks. Output elements are distributed
// over the threads with a grid-stride loop, so every output index is owned by
// exactly one thread (no atomics needed) and a <<<1, 1>>> launch still
// produces the complete result. No shared memory is used.
//
// Each output is summed in a register and written to global memory once,
// instead of doing one global read-modify-write per input pair.
__global__ void convolve_gpu_simple( float* src_1
                                   , float* src_2
                                   , float* dst
                                   )
{
    const unsigned int stride = gridDim.x * blockDim.x;

    for( unsigned int k = blockIdx.x * blockDim.x + threadIdx.x; k < DST_N; k += stride )
    {
        // Only pairs (i, j) with i + j == k and 0 <= i, j < SRC_N contribute
        // to output k; clamp i so that j = k - i stays inside src_2.
        const unsigned int i_begin = ( k >= SRC_N ) ? ( k - SRC_N + 1u ) : 0u;
        const unsigned int i_end   = ( k <  SRC_N ) ? ( k + 1u )         : SRC_N;

        float acc = 0.0f;
        for( unsigned int i = i_begin; i < i_end; ++i )
        {
            acc += src_1[ i ] * src_2[ k - i ];
        }

        dst[ k ] += acc;
    }
}

// Terminates the program if a CUDA runtime call did not succeed.
//
// err : status returned by a CUDA runtime API call or by cudaGetLastError().
//
// For any status other than cudaSuccess, the runtime's description of the
// error is written to stderr and the process exits with status 1. Returns
// normally when err == cudaSuccess.
void check_for_error( cudaError_t err )
{
    if( err != cudaSuccess )
    {
        // Report the reason before exiting; without this the program just
        // disappears with exit code 1 and no hint about which call failed.
        std::fprintf( stderr, "CUDA error: %s\n", cudaGetErrorString( err ) );
        std::exit( 1 );
    }
}

// CPU reference implementation of the convolution.
//
// a, b : input signals; elements 0 .. SRC_N-1 of each are read.
// c    : output; every product a[i] * b[j] is added to c[i + j], so the
//        touched range is 0 .. 2*SRC_N-2. Results are accumulated onto the
//        existing contents of c.
void convolve_cpu( float* a, float* b, float* c )
{
    for( int row = 0; row < SRC_N; ++row )
    {
        // Products of a[row] land in c[row], c[row + 1], ...
        float* out = c + row;

        for( int col = 0; col < SRC_N; ++col )
        {
            out[ col ] += a[ row ] * b[ col ];
        }
    }
}

// Maps a float onto a signed integer line on which adjacent representable
// floats differ by exactly 1, so that subtracting two mapped values yields
// their distance in ULPs.
// Assumes float and unsigned int are both 32 bits wide with IEEE-754 layout
// (the original bit-cast comparison relied on the same assumption).
static long long ordered_float_bits( float value )
{
    unsigned int bits = 0;
    // memcpy is the well-defined way to read the object representation;
    // dereferencing a casted pointer violates the aliasing rules.
    std::memcpy( &bits, &value, sizeof( bits ) );

    // IEEE-754 floats are sign-magnitude: mirror the negative half so that
    // -0.0f and +0.0f coincide and ordering is monotonic across zero.
    if( bits & 0x80000000u )
        return -static_cast< long long >( bits & 0x7FFFFFFFu );

    return static_cast< long long >( bits );
}

// Compares two DST_N-element float arrays element by element.
//
// a, b : arrays to compare; elements 0 .. DST_N-1 of each are read.
//
// Returns true when every pair of corresponding elements is at most 5 ULPs
// apart, false as soon as one pair is further apart.
bool compare_results( float* a, float* b )
{
    const long long max_ulp = 5;

    for( unsigned int i = 0; i < DST_N; ++i )
    {
        // 64-bit arithmetic: the difference of two mapped values cannot
        // overflow, unlike subtracting the raw 32-bit patterns.
        long long distance = ordered_float_bits( a[ i ] ) - ordered_float_bits( b[ i ] );
        if( distance < 0 )
            distance = -distance;

        if( distance > max_ulp )
            return false;
    }

    return true;
}

using namespace std;

// Runs the convolution on the CPU and on the GPU and checks that both agree.
//
// Returns 0 when the GPU result matches the CPU reference, 1 when it does
// not. Any failing CUDA call terminates the process via check_for_error().
int main(int argc, char** argv)
{
    srand( 0 );

    float *src_1, *src_2, *dst_cpu, *dst_gpu;
    float *dev_src_1, *dev_src_2, *dev_dst;

    src_1   = new float[ SRC_N ];
    src_2   = new float[ SRC_N ];
    dst_cpu = new float[ DST_N ];
    dst_gpu = new float[ DST_N ];

    check_for_error( cudaMalloc( &dev_src_1, SRC_N * sizeof( float )));
    check_for_error( cudaMalloc( &dev_src_2, SRC_N * sizeof( float )));
    check_for_error( cudaMalloc( &dev_dst  , DST_N * sizeof( float )));

    std::fill( src_1, src_1 + SRC_N, 1.0f );
    std::fill( src_2, src_2 + SRC_N, 1.0f );
    std::fill( dst_cpu, dst_cpu + DST_N, 0.0f );
    std::fill( dst_gpu, dst_gpu + DST_N, 0.0f );

    convolve_cpu( src_1, src_2, dst_cpu );

    check_for_error( cudaMemcpy( dev_src_1, src_1, SRC_N * sizeof( float ), cudaMemcpyHostToDevice ));
    check_for_error( cudaMemcpy( dev_src_2, src_2, SRC_N * sizeof( float ), cudaMemcpyHostToDevice ));

    // cudaMalloc does not clear memory and the kernel accumulates into its
    // output, so the device buffer must start out as zeros: upload the
    // zero-filled host buffer.
    check_for_error( cudaMemcpy( dev_dst, dst_gpu, DST_N * sizeof( float ), cudaMemcpyHostToDevice ));

    // NOTE(review): this launches a single GPU thread. On a GPU that also
    // drives a display, a kernel that runs for several seconds can trip the
    // operating system's display watchdog and freeze or reset the desktop.
    // Only raise the block/thread counts if the kernel partitions its work by
    // thread index; otherwise every thread would repeat the whole computation
    // and race on the output.
    convolve_gpu_simple<<< 1, 1 >>>( dev_src_1, dev_src_2, dev_dst );

    // Catches launch-configuration errors.
    check_for_error( cudaGetLastError() );

    // This blocking copy waits for the kernel to finish and also reports any
    // error raised while the kernel was executing.
    check_for_error( cudaMemcpy( dst_gpu, dev_dst, DST_N * sizeof( float ), cudaMemcpyDeviceToHost ));

    // Evaluate the comparison outside assert(): with NDEBUG defined the
    // assert expression is not evaluated and the check would silently vanish.
    const bool results_match = compare_results( dst_cpu, dst_gpu );
    if( !results_match )
    {
        std::fprintf( stderr, "GPU result differs from CPU reference\n" );
    }

    check_for_error( cudaFree( dev_src_1 ));
    check_for_error( cudaFree( dev_src_2 ));
    check_for_error( cudaFree( dev_dst ));

    // Everything was allocated with new[], so everything needs delete[].
    delete[] src_1;
    delete[] src_2;
    delete[] dst_cpu;
    delete[] dst_gpu;

    return results_match ? 0 : 1;
}[/codebox]

Any help is very much appreciated. I’m using VS2008 on a Windows 7 x64 box. The NVIDIA driver version is 258.96, and the CUDA toolkit version is 3.1.

Thanks,

Christian

Hi there,

Is there anyone who can tell me what the problem might be? This is quite important for me and my company.

Thanks,

Christian

Hi there,

Is there anyone who can tell me what the problem might be? This is quite important for me and my company.

Thanks,

Christian