Apart from running slower than FFTW, I also get this error on certain hardware:
cutilCheckMsg() CUTIL CUDA error: spPreprocessC2C011fftw_kernel<<<, 0, getKernelStream()>>> execution failed
in file </Volumes/Builds/nv/build/rel/gpgpu/toolkit/r3.1/cufft/src/accel/interface/spRealComplex.cu>, line 172 : invalid configuration argument.
It doesn't happen when I run it on a GTX280. It seems that if I run a kernel and then run CUFFT, this error appears.
Snippet!
cudaMemcpy(devfft1,fft1,sizeof(fftw_real)*(fftx*ffty),cudaMemcpyHostToDevice);
cudaMemcpy(devfft2,fft2,sizeof(fftw_real)*(fftx*ffty),cudaMemcpyHostToDevice);
cufftResult retval;
retval = cufftExecR2C(*cufft_plan_fwd,devfft1,fft1_c_device);
if(retval!=CUFFT_SUCCESS)
{
printf("failed to transform fft1 %d\n",retval);
abort();
}
retval = cufftExecR2C(*cufft_plan_fwd,devfft2,fft2_c_device);
if(retval!=CUFFT_SUCCESS)
{
printf("failed to transform fft2 %d\n",retval);
abort();
}
LaunchKernel(fft1_c_device,fft2_c_device,fftw_c_device,fftx,ffty);
retval = cufftExecC2R(*cufft_plan_back,fftw_c_device, fftw_device); // <-- this is the call that raises the error
if(retval!=CUFFT_SUCCESS)
{
printf("failed to transform C2R %d\n",retval);
abort();
}
LaunchCCKernel(fftw_device,devCC,fftx,ffty);
cudaMemcpy(CC, devCC, sizeof(cufftReal)*fftx*ffty, cudaMemcpyDeviceToHost);
It only fails on the 330M in my laptop; on the desktop with a GTX280, and on other nodes (T10, Fermi), it works fine. I have no clue
as to what could be wrong, but I need to run this on the laptop to get timing data from the profiler. Hopefully this also plays into why
it's running slowly.
Regards