Memory limitation in CUFFT: is there a memory limitation in the device?

Hi All
I use CUDA FFT (CUFFT) for 2D convolution. When the combined array size of the two images does not exceed 128x128, it works fine, but when the size exceeds 128x128, it returns all “1.#QNAN0” in the result array. I thought the limit would be 16384x16384.

Here is my code; I try to free any unused memory immediately.

/**
 * FFT-based 2D convolution of an image with a mask on the GPU (cuFFT).
 *
 * Pipeline: upload both arrays, derive a normalization factor from their
 * square sums, zero-pad both to fftH x fftW, run a forward R2C transform on
 * each, combine the spectra with CudaModulateAndNormalize, run the inverse
 * C2R transform and copy the padded result back to the host.
 *
 * @param data     host pointer to the image, dataH * dataW floats
 * @param dataH    image height
 * @param dataW    image width
 * @param mask     host pointer to the mask, kernelH * kernelW floats
 * @param kernelH  mask height
 * @param kernelW  mask width
 * @param kernelY  extra padding added to dataH before snapping the FFT height
 * @param kernelX  extra padding added to dataW before snapping the FFT width
 * @param Results  *Results must point to a host buffer able to receive
 *                 fftH * fftW floats (the whole padded result is copied)
 *
 * NOTE(review): FindSqSumDGPU, CudaPadArray0BorderDnRt, snapTransformSize and
 * CudaModulateAndNormalize are defined elsewhere; their exact contracts are
 * assumed from how they are called here — confirm against their definitions.
 */
void ConvFFT
(
float* data, int dataH, int dataW, //image and its size
float* mask, int kernelH, int kernelW, //mask and its size
int kernelY, int kernelX, float** Results)//padding size and return result
{
    fComplex *d_DataSpectrum;
    fComplex *d_KernelSpectrum;
    cufftHandle fftPlanFwd;
    cufftHandle fftPlanInv;

    float *d_Data=NULL;
    float *d_PaddedData=NULL;
    float *d_Kernel=NULL;
    float *d_PaddedKernel=NULL;

    float v1=1;
    float v2=1;

    int datanum = dataH   * dataW ;
    int datasize=datanum*sizeof(float);
    int masknum = kernelH * kernelW;
    int masksize=masknum*sizeof(float);

    // Transform dimensions (presumably snapped up to the next power of 2 —
    // confirm against snapTransformSize).
    int fftH = snapTransformSize(dataH+kernelY);
    int fftW = snapTransformSize(dataW+kernelX);

    int bufnum  = fftH*fftW;
    int bufsize = bufnum*sizeof(float);

    // Upload the image and the mask.
    cutilSafeCall( cudaMalloc((void **)&d_Data,   datasize) );
    cutilSafeCall( cudaMemcpy(d_Data,   data, datasize, cudaMemcpyHostToDevice));

    cutilSafeCall( cudaMalloc((void **)&d_Kernel, masksize) );
    cutilSafeCall( cudaMemcpy(d_Kernel, mask, masksize, cudaMemcpyHostToDevice));

    // Normalization: square sums of mask and image are expected in v1 / v2
    // (presumably written through a reference parameter — confirm).
    FindSqSumDGPU(d_Kernel, d_Kernel,masknum, v1);
    FindSqSumDGPU(d_Data, d_Data,datanum, v2);

    // The product must NOT be formed in int arithmetic: bufnum*bufnum exceeds
    // INT_MAX as soon as the transform is 256x256 (65536^2 = 2^32), which made
    // the scale factor 1/sqrt(0) = inf and turned every output into NaN.
    // Doing the whole product in double avoids both the integer overflow and
    // float range problems for large square sums.
    const double fftElems = (double)bufnum;
    v1 = (float)(1.0 / sqrt(fftElems * fftElems * (double)v1 * (double)v2));

    // Zero-filled padded image buffer; the image is placed so that the padding
    // lies below and to the right.
    cutilSafeCall( cudaMalloc((void **)&d_PaddedData,     bufsize ));
    cutilSafeCall( cudaMemset(d_PaddedData,  0, bufsize));
    CudaPadArray0BorderDnRt(d_PaddedData,d_Data,fftH,fftW, dataH,dataW, fftH-dataH, fftW-dataW);
    cutilSafeCall( cudaThreadSynchronize() );
    // The unpadded image is no longer needed.
    cutilSafeCall( cudaFree(d_Data) );

    // Same padding for the mask.
    cutilSafeCall( cudaMalloc((void **)&d_PaddedKernel,   bufsize ));
    cutilSafeCall( cudaMemset(d_PaddedKernel, 0, bufsize));
    CudaPadArray0BorderDnRt(d_PaddedKernel, d_Kernel, fftH, fftW, kernelH, kernelW, fftH-kernelH, fftW-kernelW);
    cutilSafeCall( cudaThreadSynchronize() );
    cutilSafeCall( cudaFree(d_Kernel) );

    // Forward transform of the mask; the padded mask is released right after.
    cutilSafeCall( cudaMalloc((void **)&d_KernelSpectrum,(bufnum  + fftH) * sizeof(fComplex)));
    cufftSafeCall( cufftPlan2d(&fftPlanFwd, fftH, fftW, CUFFT_R2C));

    cufftSafeCall( cufftExecR2C(fftPlanFwd, (cufftReal *)d_PaddedKernel, (cufftComplex *)d_KernelSpectrum) );
    cutilSafeCall( cudaThreadSynchronize() );
    cutilSafeCall( cudaFree(d_PaddedKernel));

    // Forward transform of the image with the same plan. d_PaddedData is kept
    // because it is reused as the output of the inverse transform.
    cutilSafeCall( cudaMalloc((void **)&d_DataSpectrum,  (bufnum  + fftH) * sizeof(fComplex)));
    cufftSafeCall( cufftExecR2C(fftPlanFwd, (cufftReal *)d_PaddedData, (cufftComplex *)d_DataSpectrum) );
    cutilSafeCall( cudaThreadSynchronize() );

    cufftSafeCall( cufftDestroy(fftPlanFwd));

    // Combine the two spectra, applying the scale factor v1; the result is
    // consumed from d_DataSpectrum below.
    CudaModulateAndNormalize(d_DataSpectrum, d_KernelSpectrum, fftH, fftW, 1,v1);
    cutilSafeCall( cudaThreadSynchronize() );
    cutilSafeCall( cudaFree(d_KernelSpectrum) );

    // Inverse transform back into the padded image buffer.
    cufftSafeCall( cufftPlan2d(&fftPlanInv, fftH, fftW, CUFFT_C2R));
    cufftSafeCall( cufftExecC2R(fftPlanInv, (cufftComplex *)d_DataSpectrum, (cufftReal *)d_PaddedData) );

    cutilSafeCall( cudaThreadSynchronize());
    cutilSafeCall( cudaFree(d_DataSpectrum));
    cufftSafeCall( cufftDestroy(fftPlanInv));

    // Copy the full fftH x fftW result to the caller's host buffer.
    cutilSafeCall( cudaMemcpy(*Results, d_PaddedData, bufsize, cudaMemcpyDeviceToHost));

    cutilSafeCall( cudaFree(d_PaddedData));

    // NOTE(review): cudaThreadExit() is a deprecated call in newer toolkits;
    // kept so callers see the same behavior as before.
    cudaThreadExit();

}

Hi All
I use CUDA FFT (CUFFT) for 2D convolution. When the combined array size of the two images does not exceed 128x128, it works fine, but when the size exceeds 128x128, it returns all “1.#QNAN0” in the result array. I thought the limit would be 16384x16384.

Here is my code; I try to free any unused memory immediately.

/**
 * FFT-based 2D convolution of an image with a mask on the GPU (cuFFT).
 *
 * Pipeline: upload both arrays, derive a normalization factor from their
 * square sums, zero-pad both to fftH x fftW, run a forward R2C transform on
 * each, combine the spectra with CudaModulateAndNormalize, run the inverse
 * C2R transform and copy the padded result back to the host.
 *
 * @param data     host pointer to the image, dataH * dataW floats
 * @param dataH    image height
 * @param dataW    image width
 * @param mask     host pointer to the mask, kernelH * kernelW floats
 * @param kernelH  mask height
 * @param kernelW  mask width
 * @param kernelY  extra padding added to dataH before snapping the FFT height
 * @param kernelX  extra padding added to dataW before snapping the FFT width
 * @param Results  *Results must point to a host buffer able to receive
 *                 fftH * fftW floats (the whole padded result is copied)
 *
 * NOTE(review): FindSqSumDGPU, CudaPadArray0BorderDnRt, snapTransformSize and
 * CudaModulateAndNormalize are defined elsewhere; their exact contracts are
 * assumed from how they are called here — confirm against their definitions.
 */
void ConvFFT
(
float* data, int dataH, int dataW, //image and its size
float* mask, int kernelH, int kernelW, //mask and its size
int kernelY, int kernelX, float** Results)//padding size and return result
{
    fComplex *d_DataSpectrum;
    fComplex *d_KernelSpectrum;
    cufftHandle fftPlanFwd;
    cufftHandle fftPlanInv;

    float *d_Data=NULL;
    float *d_PaddedData=NULL;
    float *d_Kernel=NULL;
    float *d_PaddedKernel=NULL;

    float v1=1;
    float v2=1;

    int datanum = dataH   * dataW ;
    int datasize=datanum*sizeof(float);
    int masknum = kernelH * kernelW;
    int masksize=masknum*sizeof(float);

    // Transform dimensions (presumably snapped up to the next power of 2 —
    // confirm against snapTransformSize).
    int fftH = snapTransformSize(dataH+kernelY);
    int fftW = snapTransformSize(dataW+kernelX);

    int bufnum  = fftH*fftW;
    int bufsize = bufnum*sizeof(float);

    // Upload the image and the mask.
    cutilSafeCall( cudaMalloc((void **)&d_Data,   datasize) );
    cutilSafeCall( cudaMemcpy(d_Data,   data, datasize, cudaMemcpyHostToDevice));

    cutilSafeCall( cudaMalloc((void **)&d_Kernel, masksize) );
    cutilSafeCall( cudaMemcpy(d_Kernel, mask, masksize, cudaMemcpyHostToDevice));

    // Normalization: square sums of mask and image are expected in v1 / v2
    // (presumably written through a reference parameter — confirm).
    FindSqSumDGPU(d_Kernel, d_Kernel,masknum, v1);
    FindSqSumDGPU(d_Data, d_Data,datanum, v2);

    // The product must NOT be formed in int arithmetic: bufnum*bufnum exceeds
    // INT_MAX as soon as the transform is 256x256 (65536^2 = 2^32), which made
    // the scale factor 1/sqrt(0) = inf and turned every output into NaN.
    // Doing the whole product in double avoids both the integer overflow and
    // float range problems for large square sums.
    const double fftElems = (double)bufnum;
    v1 = (float)(1.0 / sqrt(fftElems * fftElems * (double)v1 * (double)v2));

    // Zero-filled padded image buffer; the image is placed so that the padding
    // lies below and to the right.
    cutilSafeCall( cudaMalloc((void **)&d_PaddedData,     bufsize ));
    cutilSafeCall( cudaMemset(d_PaddedData,  0, bufsize));
    CudaPadArray0BorderDnRt(d_PaddedData,d_Data,fftH,fftW, dataH,dataW, fftH-dataH, fftW-dataW);
    cutilSafeCall( cudaThreadSynchronize() );
    // The unpadded image is no longer needed.
    cutilSafeCall( cudaFree(d_Data) );

    // Same padding for the mask.
    cutilSafeCall( cudaMalloc((void **)&d_PaddedKernel,   bufsize ));
    cutilSafeCall( cudaMemset(d_PaddedKernel, 0, bufsize));
    CudaPadArray0BorderDnRt(d_PaddedKernel, d_Kernel, fftH, fftW, kernelH, kernelW, fftH-kernelH, fftW-kernelW);
    cutilSafeCall( cudaThreadSynchronize() );
    cutilSafeCall( cudaFree(d_Kernel) );

    // Forward transform of the mask; the padded mask is released right after.
    cutilSafeCall( cudaMalloc((void **)&d_KernelSpectrum,(bufnum  + fftH) * sizeof(fComplex)));
    cufftSafeCall( cufftPlan2d(&fftPlanFwd, fftH, fftW, CUFFT_R2C));

    cufftSafeCall( cufftExecR2C(fftPlanFwd, (cufftReal *)d_PaddedKernel, (cufftComplex *)d_KernelSpectrum) );
    cutilSafeCall( cudaThreadSynchronize() );
    cutilSafeCall( cudaFree(d_PaddedKernel));

    // Forward transform of the image with the same plan. d_PaddedData is kept
    // because it is reused as the output of the inverse transform.
    cutilSafeCall( cudaMalloc((void **)&d_DataSpectrum,  (bufnum  + fftH) * sizeof(fComplex)));
    cufftSafeCall( cufftExecR2C(fftPlanFwd, (cufftReal *)d_PaddedData, (cufftComplex *)d_DataSpectrum) );
    cutilSafeCall( cudaThreadSynchronize() );

    cufftSafeCall( cufftDestroy(fftPlanFwd));

    // Combine the two spectra, applying the scale factor v1; the result is
    // consumed from d_DataSpectrum below.
    CudaModulateAndNormalize(d_DataSpectrum, d_KernelSpectrum, fftH, fftW, 1,v1);
    cutilSafeCall( cudaThreadSynchronize() );
    cutilSafeCall( cudaFree(d_KernelSpectrum) );

    // Inverse transform back into the padded image buffer.
    cufftSafeCall( cufftPlan2d(&fftPlanInv, fftH, fftW, CUFFT_C2R));
    cufftSafeCall( cufftExecC2R(fftPlanInv, (cufftComplex *)d_DataSpectrum, (cufftReal *)d_PaddedData) );

    cutilSafeCall( cudaThreadSynchronize());
    cutilSafeCall( cudaFree(d_DataSpectrum));
    cufftSafeCall( cufftDestroy(fftPlanInv));

    // Copy the full fftH x fftW result to the caller's host buffer.
    cutilSafeCall( cudaMemcpy(*Results, d_PaddedData, bufsize, cudaMemcpyDeviceToHost));

    cutilSafeCall( cudaFree(d_PaddedData));

    // NOTE(review): cudaThreadExit() is a deprecated call in newer toolkits;
    // kept so callers see the same behavior as before.
    cudaThreadExit();

}

I found the error: it is an overflow when I calculate the square sum (Sq Sum).

I found the error: it is an overflow when I calculate the square sum (Sq Sum).