I have a little program that reliably crashes - see below. This code is a heavily whittled-down version of my real code, but it illustrates the problem succinctly. It always crashes, but after a different amount of run time on each run.
I have tried it with drivers 197.13, 197.45 and 196.21.
I have compiled it with version 3.0 of the CUDA toolkit, as well as version 2.3.
I have tried it on two different machines with a 9800 GT, as well as a third machine with some older NVIDIA (integrated) GPU (don’t remember what it was anymore).
I have tried all sorts of ways to play around with it - I can ultimately get it not to crash by allocating WAY more memory for the variables in and out than should be needed. Otherwise, I keep getting “unspecified launch failure”.
I have tried all sorts of other things as well - enough to make me suspect this is some driver/windows/cuda bug. Of course, I’d love for someone to tell me how very wrong I am! :)
The file is named “crash.cu” and is compiled using the CUDA Build Rule v3.0.14. I set it up to compile for sm_10 and sm_20 (these are the defaults for the Build Rule). The card I mainly use is a 9800 GT, which is sm_11.
using std::cerr;
using std::endl;
#define NUMSAMPLES 100000
// Terminates the program when a CUDA runtime call reports a failure.
//
// err: status code returned by a CUDA runtime call.
// msg: optional context string printed in front of the CUDA error
//      description; defaults to the empty string. A null pointer is
//      treated like the empty string.
//
// When err is cudaSuccess the function returns without doing anything.
// Otherwise it writes "Cuda error: <msg>: <description>" to stderr and
// exits the process with EXIT_FAILURE.
void checkCUDAError(cudaError_t err, const char *msg = "")
{
    if (cudaSuccess != err)
    {
        // Streaming a null char pointer is undefined behavior, so fall back
        // to an empty context string.
        const char *context = msg ? msg : "";
        std::cerr << "Cuda error: " << context << ": " << cudaGetErrorString(err) << std::endl;
        exit(EXIT_FAILURE);
    }
}
// Fills out[i] for every i in [0, L): the .x component receives in[i] and
// the .y component receives the element index i converted to float.
//
// in:  device array with at least L floats (read only by this kernel).
// out: device array with at least L float2 elements (written).
// L:   number of elements to process.
//
// Launch layout: 1D grid of 1D blocks. Each thread handles at most one
// element, so gridDim.x * blockDim.x must be >= L for all L elements to be
// written; threads whose index is >= L do nothing.
__global__ void kernel_r2c_chirp(float *in, float2 *out, int L)
{
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < L)
    {
        // Build the pair once and store it as a single float2 instead of
        // writing the two components separately.
        out[tid] = make_float2(in[tid], static_cast<float>(tid));
    }
}
// Stress driver for kernel_r2c_chirp.
//
// Allocates NUMSAMPLES input floats and NUMSAMPLES output float2 values on
// the device, then for every block size from 16 to 512 (step 16) launches
// the kernel 10000 times. After each batch the block size is printed to
// stderr. Any CUDA error terminates the program with EXIT_FAILURE after
// printing a diagnostic; on success the function returns 0.
int main(void)
{
    float *in = 0;
    float2 *out = 0;

    // allocate arrays on device
    checkCUDAError(cudaMalloc((void **) &in, sizeof(float) * NUMSAMPLES), "allocating a_d");
    checkCUDAError(cudaMalloc((void **) &out, sizeof(float2) * NUMSAMPLES), "allocating b_d");

    // cudaMalloc does not initialize the allocation; zero the input so the
    // kernel reads defined values.
    checkCUDAError(cudaMemset(in, 0, sizeof(float) * NUMSAMPLES), "zeroing in");

    for (int THREADS_PER_BLOCK = 16; THREADS_PER_BLOCK <= 512; THREADS_PER_BLOCK = THREADS_PER_BLOCK + 16)
    {
        // Integer ceil-division: enough blocks to cover NUMSAMPLES elements.
        const int NBLOCKS = (NUMSAMPLES + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;

        for (int J = 0; J < 10000; J++)
        {
            kernel_r2c_chirp <<< NBLOCKS, THREADS_PER_BLOCK >>> (in, out, NUMSAMPLES);

            // Catches launch-configuration errors, plus any failure of an
            // earlier asynchronous launch that the runtime has already seen.
            cudaError_t err = cudaGetLastError();
            if (cudaSuccess != err)
            {
                std::cerr << "J = " << J << std::endl;
                std::cerr << "Cuda error: " << cudaGetErrorString(err) << std::endl;
                exit(EXIT_FAILURE);
            }
        }

        // Kernel launches are asynchronous. Wait for the whole batch on the
        // default stream so that an execution failure is reported here,
        // against the block size that caused it, rather than at some later
        // unrelated launch (or never).
        cudaError_t syncErr = cudaStreamSynchronize(0);
        if (cudaSuccess != syncErr)
        {
            std::cerr << "THREADS_PER_BLOCK = " << THREADS_PER_BLOCK << std::endl;
            std::cerr << "Cuda error: " << cudaGetErrorString(syncErr) << std::endl;
            exit(EXIT_FAILURE);
        }

        std::cerr << THREADS_PER_BLOCK << std::endl;
    }

    // cleanup
    checkCUDAError(cudaFree(in), "freeing in");
    checkCUDAError(cudaFree(out), "freeing out");

    return 0;
}