CUFFT: cudaThreadSynchronize and "unspecified launch failure"

I have a problem running this code on a Quadro FX1700 card: it fails with an “unspecified launch failure” when I call the cudaThreadSynchronize() function after the cuFFT call. Is it caused by the FFT dimension? In the code, nazft is 4096. If I don’t call the first cudaThreadSynchronize() in the code, the same error occurs on the cudaThreadSynchronize() after the kernel.

// Forward FFT of every rc[i] row on the GPU, followed by kernelForwardAzFFT_2.
//
// For each of the nvp rows: upload rc[i] and tfftw, run one out-of-place
// complex-to-complex forward FFT of length nazft (rcDev -> tfftwDev), launch
// kernelForwardAzFFT_2 on both device buffers, then download both buffers
// back into rc[i] and tfftw.
//
// Relies on names declared outside this fragment: nazft, nvp, i, rc, tfftw,
// checkCUDAError, CUFFT_SAFE_CALL and kernelForwardAzFFT_2.
cufftHandle plan;

// CUFFT_SAFE_CALL is used as a statement: its result is not a value that can
// be assigned, and the previous `rst` variable was never read anyway.
CUFFT_SAFE_CALL(cufftPlan1d(&plan, nazft, CUFFT_C2C, 1));
checkCUDAError("cudaPlan calls");

// Launch configuration for kernelForwardAzFFT_2: 8 blocks * 512 threads = 4096.
// NOTE(review): this matches nazft == 4096 only; the kernel receives no length
// argument, so confirm it never indexes past nazft for other sizes.
const int BLOCK_SIZE = 512;
const int GRID_SIZE = 8;
dim3 dimBlock(BLOCK_SIZE);
dim3 dimGrid(GRID_SIZE);

// Allocate device memory.
cufftComplex *rcDev;
cufftComplex *tfftwDev;
size_t size1 = nazft * sizeof(cufftComplex);  // size in bytes of one rc row
size_t size2 = nazft * sizeof(cufftComplex);  // size in bytes of tfftw

cudaMalloc((void**)&rcDev, size1);
checkCUDAError("cudaMalloc1 calls");
cudaMalloc((void**)&tfftwDev, size2);
checkCUDAError("cudaMalloc2 calls");

for (i = 0; i < nvp; i++) {
    // Host-to-device copies: row i always lands at the START of rcDev.
    cudaMemcpy(rcDev, rc[i], size1, cudaMemcpyHostToDevice);
    checkCUDAError("cudaMemcpy1 calls");
    // NOTE(review): the out-of-place FFT below overwrites all nazft elements of
    // tfftwDev, so this upload looks redundant; kept to preserve behaviour.
    cudaMemcpy(tfftwDev, tfftw, size2, cudaMemcpyHostToDevice);
    checkCUDAError("cudaMemcpy2 calls");

    // FIX: the input must be rcDev itself. The previous `&rcDev[i]` shifted the
    // input by i elements, so for i > 0 the transform read i elements beyond
    // the nazft-element allocation (and skipped the first i samples of the
    // row), which surfaces as "unspecified launch failure" at the next sync.
    CUFFT_SAFE_CALL(cufftExecC2C(plan, rcDev, tfftwDev, CUFFT_FORWARD));
    checkCUDAError("cudaExecCUFFT call");

    // Synchronize so that an asynchronous FFT failure is reported here rather
    // than being attributed to the kernel launched below.
    cudaThreadSynchronize();
    checkCUDAError("cudaSyncronize calls1");

    kernelForwardAzFFT_2<<<dimGrid, dimBlock>>>(rcDev, tfftwDev);
    checkCUDAError("cudaKernel calls");

    cudaThreadSynchronize();
    checkCUDAError("cudaSyncronize calls2");

    // Device-to-host copies of both buffers; these were previously unchecked.
    cudaMemcpy(rc[i], rcDev, size1, cudaMemcpyDeviceToHost);
    checkCUDAError("cudaMemcpy3 calls");
    cudaMemcpy(tfftw, tfftwDev, size2, cudaMemcpyDeviceToHost);
    checkCUDAError("cudaMemcpy4 calls");
}

// Release the FFT plan and the device buffers.
CUFFT_SAFE_CALL(cufftDestroy(plan));
cudaFree(rcDev);
cudaFree(tfftwDev);

up