Hi All
I use CUFFT for 2D convolution and find that it works fine as long as the combined (padded) array size of the two images stays at or below 128x128. When the size exceeds 128x128, however, every element of the result array comes back as “1.#QNAN0”. I thought the size limit would be 16384x16384.
Here is my code. I try to free each buffer as soon as it is no longer needed:
/*
 * ConvFFT - FFT-based 2D convolution of an image with a mask on the GPU.
 *
 * Pipeline (each device buffer is freed as soon as it is no longer needed):
 *   1. upload image and mask to the device;
 *   2. obtain the two normalization terms via FindSqSumDGPU;
 *   3. zero-pad image and mask to fftH x fftW;
 *   4. forward R2C FFT of both padded arrays;
 *   5. combine the spectra with CudaModulateAndNormalize;
 *   6. inverse C2R FFT back into the padded-image buffer;
 *   7. copy fftH * fftW floats back to the host buffer *Results.
 *
 * Parameters:
 *   data, dataH, dataW      host image and its height/width
 *   mask, kernelH, kernelW  host mask and its height/width
 *   kernelY, kernelX        extra padding added to dataH/dataW before the
 *                           transform size is snapped by snapTransformSize
 *   Results                 *Results must already point to a host buffer of
 *                           at least fftH * fftW floats; this function does
 *                           not allocate it.
 *
 * Side effect: cudaThreadExit() is called before returning.
 */
void ConvFFT
(
    float* data, int dataH, int dataW,        //image and its size
    float* mask, int kernelH, int kernelW,    //mask and its size
    int kernelY, int kernelX, float** Results)//padding size and return result
{
    fComplex *d_DataSpectrum;
    fComplex *d_KernelSpectrum;
    cufftHandle fftPlanFwd;
    cufftHandle fftPlanInv;

    float *d_Data = NULL;
    float *d_PaddedData = NULL;
    float *d_Kernel = NULL;
    float *d_PaddedKernel = NULL;

    float v1 = 1;
    float v2 = 1;

    // Element counts stay int (the helpers take int); byte counts are size_t
    // so that count * sizeof(float) is not truncated back into an int.
    int datanum = dataH * dataW;
    size_t datasize = (size_t)datanum * sizeof(float);
    int masknum = kernelH * kernelW;
    size_t masksize = (size_t)masknum * sizeof(float);

    // Transform size (original note: snapped to the next power of 2).
    int fftH = snapTransformSize(dataH + kernelY);
    int fftW = snapTransformSize(dataW + kernelX);
    int bufnum = fftH * fftW;
    size_t bufsize = (size_t)bufnum * sizeof(float);

    // Upload the raw image and mask.
    cutilSafeCall( cudaMalloc((void **)&d_Data, datasize) );
    cutilSafeCall( cudaMemcpy(d_Data, data, datasize, cudaMemcpyHostToDevice) );
    cutilSafeCall( cudaMalloc((void **)&d_Kernel, masksize) );
    cutilSafeCall( cudaMemcpy(d_Kernel, mask, masksize, cudaMemcpyHostToDevice) );

    // Normalization terms (original note: "sq sum of two images").
    // NOTE(review): presumably FindSqSumDGPU writes its result into the last
    // argument by reference - confirm against its declaration.
    FindSqSumDGPU(d_Kernel, d_Kernel, masknum, v1);
    FindSqSumDGPU(d_Data, d_Data, datanum, v2);

    // BUG FIX: the original evaluated bufnum*bufnum in int. For a 256x256
    // transform bufnum is 65536 and bufnum*bufnum is 2^32, which overflows a
    // 32-bit int (in practice wrapping to 0). The scale then became
    // 1/sqrt(0) = inf and every output element turned into NaN. The radicand
    // is now formed in double so it cannot overflow for any int bufnum.
    const double radicand = (double)bufnum * (double)bufnum
                          * (double)v1 * (double)v2;
    // A zero radicand (e.g. all-zero image or mask) would again give an
    // infinite scale; use 0 there so the output is zeros instead of NaN.
    v1 = (radicand == 0.0) ? 0.0f : (float)(1.0 / sqrt(radicand));

    // Zero-filled padded image; the source is placed so the padding lies
    // below and to the right.
    cutilSafeCall( cudaMalloc((void **)&d_PaddedData, bufsize) );
    cutilSafeCall( cudaMemset(d_PaddedData, 0, bufsize) );
    CudaPadArray0BorderDnRt(d_PaddedData, d_Data, fftH, fftW,
                            dataH, dataW, fftH - dataH, fftW - dataW);
    cutilSafeCall( cudaThreadSynchronize() );
    cutilSafeCall( cudaFree(d_Data) );

    // Same padding for the mask.
    cutilSafeCall( cudaMalloc((void **)&d_PaddedKernel, bufsize) );
    cutilSafeCall( cudaMemset(d_PaddedKernel, 0, bufsize) );
    CudaPadArray0BorderDnRt(d_PaddedKernel, d_Kernel, fftH, fftW,
                            kernelH, kernelW, fftH - kernelH, fftW - kernelW);
    cutilSafeCall( cudaThreadSynchronize() );
    cutilSafeCall( cudaFree(d_Kernel) );

    // Forward transform of the mask. The spectrum buffer holds
    // (bufnum + fftH) complex values, as in the original.
    cutilSafeCall( cudaMalloc((void **)&d_KernelSpectrum,
                              (bufnum + fftH) * sizeof(fComplex)) );
    cufftSafeCall( cufftPlan2d(&fftPlanFwd, fftH, fftW, CUFFT_R2C) );
    cufftSafeCall( cufftExecR2C(fftPlanFwd, (cufftReal *)d_PaddedKernel,
                                (cufftComplex *)d_KernelSpectrum) );
    cutilSafeCall( cudaThreadSynchronize() );
    cutilSafeCall( cudaFree(d_PaddedKernel) );

    // Forward transform of the image, reusing the same plan.
    cutilSafeCall( cudaMalloc((void **)&d_DataSpectrum,
                              (bufnum + fftH) * sizeof(fComplex)) );
    cufftSafeCall( cufftExecR2C(fftPlanFwd, (cufftReal *)d_PaddedData,
                                (cufftComplex *)d_DataSpectrum) );
    cutilSafeCall( cudaThreadSynchronize() );
    cufftSafeCall( cufftDestroy(fftPlanFwd) );

    // Combine the two spectra, passing the normalization factor v1.
    // NOTE(review): presumably a pointwise complex multiply scaled by v1,
    // with the result left in d_DataSpectrum - confirm against the helper.
    CudaModulateAndNormalize(d_DataSpectrum, d_KernelSpectrum, fftH, fftW, 1, v1);
    cutilSafeCall( cudaThreadSynchronize() );
    cutilSafeCall( cudaFree(d_KernelSpectrum) );

    // Inverse transform; d_PaddedData is reused as the real-valued output.
    cufftSafeCall( cufftPlan2d(&fftPlanInv, fftH, fftW, CUFFT_C2R) );
    cufftSafeCall( cufftExecC2R(fftPlanInv, (cufftComplex *)d_DataSpectrum,
                                (cufftReal *)d_PaddedData) );
    cutilSafeCall( cudaThreadSynchronize() );
    cutilSafeCall( cudaFree(d_DataSpectrum) );
    cufftSafeCall( cufftDestroy(fftPlanInv) );

    // Return the full fftH x fftW result to the caller-provided host buffer.
    cutilSafeCall( cudaMemcpy(*Results, d_PaddedData, bufsize,
                              cudaMemcpyDeviceToHost) );
    cutilSafeCall( cudaFree(d_PaddedData) );

    // NOTE(review): cudaThreadExit() is deprecated (cudaDeviceReset() is the
    // replacement) and affects device state beyond this function; kept
    // unchanged to preserve existing behavior for callers.
    cudaThreadExit();
}