Hi.
I’m trying to replicate the convolutionFFT2D of the nvidia gpu computing sdk, but the convolution operation is giving me some strange results. Can anyone see anything strange in the code?
The input values are all ‘1’.
The output of the convolution is ‘nan’.
Here is the code:
inline __device__ void mulAndScale(double2& a, const double2& b,
const double& c)
{
double2 t = {c * (a.x * b.x - a.y * b.y), c * (a.y * b.x + a.x * b.y)};
a = t;
}
__global__ void modulateAndNormalize_kernel( double2 *d_Dst,
double2 *d_Src,
int dataSize,
double c)
{
const int i = blockDim.x * blockIdx.x + threadIdx.x;
if(i >= dataSize)
return;
double2 a = d_Src[i];
double2 b = d_Dst[i];
mulAndScale(a, b, c);
d_Dst[i] = a;
}
//Round a / b to nearest higher integer value
inline int iDivUp(int a, int b){
return (a % b != 0) ? (a / b + 1) : (a / b);
}
void modulateAndNormalize(
double2 *d_Dst,
double2 *d_Src,
int fftH,
int fftW,
int padding
){
assert( fftW % 2 == 0 );
const int dataSize = fftH * (fftW / 2 + padding);
modulateAndNormalize_kernel<<<iDivUp(dataSize, 256), 256>>>(
d_Dst,
d_Src,
dataSize,
1.0f / (double)(fftW * fftH)
);
//cutilCheckMsg("modulateAndNormalize() execution failed\n");
}
void k_CONV(double **data_pointers, int *args){
int N = args[0];
// create cufft plan
cufftHandle fftPlanFwdX, fftPlanFwdY ,fftPlanInv;
cufftPlan2d(&fftPlanFwdX, N, N, CUFFT_D2Z);
cufftPlan2d(&fftPlanFwdY, N, N, CUFFT_D2Z);
cufftPlan2d(&fftPlanInv, N, N, CUFFT_Z2D);
// allocate space on device (input x2, spectum x2)
printf("Allocating data on device\n");
// input data
double *d_InputSignalX;
double *d_InputSignalY;
cudaMalloc((void **)&d_InputSignalX, N * N * sizeof(double));
cudaMalloc((void **)&d_InputSignalY, N * N * sizeof(double));
// signal spectrum data
double2 *spectrumX,
*spectrumY;
cudaMalloc((void **)&spectrumX, N * (N/2 + 1) * sizeof(double2));
cudaMalloc((void **)&spectrumY, N * (N/2 + 1) * sizeof(double2));
// copy data to device
printf("Copying data to device\n");
cudaMemcpy(d_InputSignalX, data_pointers[0], N * sizeof(double), cudaMemcpyHostToDevice );
cudaMemcpy(d_InputSignalY, data_pointers[1], N * sizeof(double), cudaMemcpyHostToDevice );
printf("Forward fft\n");
// call r2c cufft
cufftExecD2Z(fftPlanFwdX, (cufftDoubleReal *)d_InputSignalX, (cufftDoubleComplex *)spectrumX);
cufftExecD2Z(fftPlanFwdY, (cufftDoubleReal *)d_InputSignalY, (cufftDoubleComplex *)spectrumY);
// call modulateAndNormalize
printf("Modulate and normalize\n");
modulateAndNormalize(spectrumX, spectrumY, N, N, 1);
// call c2r
printf("Inverse fft\n");
cufftExecZ2D(fftPlanInv, (cufftDoubleComplex *)spectrumX, (cufftDoubleReal *)d_InputSignalX);
// copy data back to host
printf("Copying result to host\n");
cudaMemcpy(data_pointers[0], d_InputSignalX, N * N * sizeof(double), cudaMemcpyDeviceToHost);
printf("Cleaning plans\n");
// destroy plans
cufftDestroy(fftPlanFwdX);
cufftDestroy(fftPlanFwdY);
cufftDestroy(fftPlanInv);
printf("Cleaning memory\n");
// free allocated memory
cudaFree(spectrumX);
cudaFree(spectrumY);
cudaFree(d_InputSignalX);
cudaFree(d_InputSignalY);
}
Thanks.