I’ve a problem running this code on the Quadro FX1700 card, it fault with a “unispecified launch failure” when i call the cudaThreadSyncornize() function after the cufft. Is it caused by fft dimension? in the code nazft is 4096. If I don’t call the first cudaThreadSyncronize() in the code, the same error occurs on the cudaThreadSyncronize() after the kernel.
cufftHandle plan;
cufftResult rst = CUFFT_SAFE_CALL(cufftPlan1d(&plan, nazft, CUFFT_C2C,1));
checkCUDAError("cudaPlan calls");
size_t size1, size2;
int BLOCK_SIZE;
int GRID_SIZE;
BLOCK_SIZE=512;
GRID_SIZE=8;
cufftComplex *rcDev;
cufftComplex *tfftwDev;
dim3 dimBlock(BLOCK_SIZE);
dim3 dimGrid(GRID_SIZE);
//allocate device memory
size1 = nazft* sizeof(cufftComplex); //dimensione della struttura rc
size2 = nazft* sizeof(cufftComplex);//dimensione della struttura tfftw
cudaMalloc((void**)&rcDev,size1);
checkCUDAError("cudaMalloc1 calls");
cudaMalloc((void**)&tfftwDev,size2);
checkCUDAError("cudaMalloc2 calls");
for (i=0; i < nvp; i++){
//host to device memory copy
cudaMemcpy(rcDev, rc[i], size1, cudaMemcpyHostToDevice);
checkCUDAError("cudaMemcpy1 calls");
cudaMemcpy( tfftwDev, tfftw, size2, cudaMemcpyHostToDevice );
checkCUDAError("cudaMemcpy2 calls");
CUFFT_SAFE_CALL(cufftExecC2C(plan,&rcDev[i],(cufftComplex *) tfftwDev, CUFFT_FORWARD));
checkCUDAError("cudaExecCUFFT call");
cudaThreadSynchronize();
checkCUDAError("cudaSyncronize calls1");
kernelForwardAzFFT_2 <<< dimGrid, dimBlock >>>( rcDev, tfftwDev);
checkCUDAError("cudaKernel calls");
cudaThreadSynchronize();
checkCUDAError("cudaSyncronize calls2");
cudaMemcpy(rc[i],rcDev,size1,cudaMemcpyDeviceToHost);
cudaMemcpy(tfftw,tfftwDev,size2,cudaMemcpyDeviceToHost);
}
CUFFT_SAFE_CALL(cufftDestroy(plan));
cudaFree(rcDev);
cudaFree(tfftwDev);