Hello, world!
Merry Christmas!
I have some problems with the convolution, based on cufft. I used the sample code from cuda (cuda/samples/3_Imaging/convolutionFFT2D) as a base and changed it a little bit - just to see the difference with FFTW routines. I have never used CUDA before.
However, I had strange (from my point of view) results. I present the code below. If somebody of you had a deal with this sample, please, help me to understand, what’s going on… :)
Convolution function (changed sample)
double *cuda_convolution_2D(double *h_Data, double *h_Kernel,
int dataW, int dataH,
int kernelW, int kernelH,
int kx, int ky)
{
float *h_ResultGPU;
float *d_Data, *d_PaddedData, *d_Kernel, *d_PaddedKernel;
fComplex *d_DataSpectrum, *d_KernelSpectrum;
cufftHandle fftPlanFwd, fftPlanInv;
bool bRetVal;
StopWatchInterface *hTimer = NULL;
sdkCreateTimer(&hTimer);
printf("Testing built-in R2C / C2R FFT-based convolution\n");
const int kernelY = kx;
const int kernelX = ky;
const int fftH = snapTransformSize(dataH + kernelH - 1);
const int fftW = snapTransformSize(dataW + kernelW - 1);
printf("...allocating memory\n");
h_ResultGPU = (float *)malloc(fftH * fftW * sizeof(float));
checkCudaErrors(cudaMalloc((void **)&d_Data, dataH * dataW * sizeof(float)));
checkCudaErrors(cudaMalloc((void **)&d_Kernel, kernelH * kernelW * sizeof(float)));
checkCudaErrors(cudaMalloc((void **)&d_PaddedData, fftH * fftW * sizeof(float)));
checkCudaErrors(cudaMalloc((void **)&d_PaddedKernel, fftH * fftW * sizeof(float)));
checkCudaErrors(cudaMalloc((void **)&d_DataSpectrum, fftH * (fftW / 2 + 1) * sizeof(fComplex)));
checkCudaErrors(cudaMalloc((void **)&d_KernelSpectrum, fftH * (fftW / 2 + 1) * sizeof(fComplex)));
printf("...creating R2C & C2R FFT plans for %i x %i\n", fftH, fftW);
checkCudaErrors(cufftPlan2d(&fftPlanFwd, fftW, fftH, CUFFT_R2C));
checkCudaErrors(cufftPlan2d(&fftPlanInv, fftW, fftH, CUFFT_C2R));
printf("...uploading to GPU and padding convolution kernel and input data\n");
checkCudaErrors(cudaMemcpy(d_Kernel, h_Kernel, kernelH * kernelW * sizeof(float), cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(d_Data, h_Data, dataH * dataW * sizeof(float), cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemset(d_PaddedKernel, 0, fftH * fftW * sizeof(float)));
checkCudaErrors(cudaMemset(d_PaddedData, 0, fftH * fftW * sizeof(float)));
padKernel(
d_PaddedKernel,
d_Kernel,
fftH,
fftW,
kernelH,
kernelW,
kernelY,
kernelX
);
padDataClampToBorder(
d_PaddedData,
d_Data,
fftH,
fftW,
dataH,
dataW,
kernelH,
kernelW,
kernelY,
kernelX
);
//Not including kernel transformation into time measurement,
//since convolution kernel is not changed very frequently
printf("...transforming convolution kernel\n");
checkCudaErrors(cufftExecR2C(fftPlanFwd, (cufftReal *)d_PaddedKernel, (cufftComplex *)d_KernelSpectrum));
printf("...running GPU FFT convolution: ");
checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&hTimer);
sdkStartTimer(&hTimer);
checkCudaErrors(cufftExecR2C(fftPlanFwd, (cufftReal *)d_PaddedData, (cufftComplex *)d_DataSpectrum));
modulateAndNormalize(d_DataSpectrum, d_KernelSpectrum, fftW, fftH, 1);
checkCudaErrors(cufftExecC2R(fftPlanInv, (cufftComplex *)d_DataSpectrum, (cufftReal *)d_PaddedData));
checkCudaErrors(cudaDeviceSynchronize());
sdkStopTimer(&hTimer);
double gpuTime = sdkGetTimerValue(&hTimer);
printf("%f MPix/s (%f ms)\n", (double)dataH * (double)dataW * 1e-6 / (gpuTime * 0.001), gpuTime);
printf("...reading back GPU convolution results\n");
checkCudaErrors(cudaMemcpy(h_ResultGPU, d_PaddedData, fftH * fftW * sizeof(float), cudaMemcpyDeviceToHost));
double sum_delta2 = 0;
double sum_ref2 = 0;
double max_delta_ref = 0;
double *res = new double[fftW*fftH];
for (int y = 0; y < fftH; y++) {
for (int x = 0; x < fftW; x++) {
double rGPU = (double)h_ResultGPU[y * fftW + x];
res[fftW*y + x] = rGPU;
}
}
printf("...shutting down\n");
sdkDeleteTimer(&hTimer);
checkCudaErrors(cufftDestroy(fftPlanInv));
checkCudaErrors(cufftDestroy(fftPlanFwd));
checkCudaErrors(cudaFree(d_DataSpectrum));
checkCudaErrors(cudaFree(d_KernelSpectrum));
checkCudaErrors(cudaFree(d_PaddedData));
checkCudaErrors(cudaFree(d_PaddedKernel));
checkCudaErrors(cudaFree(d_Data));
checkCudaErrors(cudaFree(d_Kernel));
free(h_ResultGPU);
return res;
}
And main function is:
int N=1024, M=1024;
int r_sq = 10.0;
int r_kern = N/10;
double *data = new double[N*M];
int Nk = 1024, Mk = 1024;
double *kernel = new double[Nk*Mk];
// Initializing sample array
for(int i = 0; i<N; i++) {
for(int j=0; j<M; j++) {
if((i>2*N/3-r_sq && i<2*N/3+r_sq && j<M/2+r_sq && j>M/2-r_sq) ||
(i>N/3-r_sq && i<N/3+r_sq && j<M/2+r_sq && j>M/2-r_sq) ||
(i>N/3-r_sq && i<N/3+r_sq && j<M/3+r_sq && j>M/3-r_sq) ){
data[j*N+i] = 1.0;
} else {
data[j*N+i] = 0.0;
}
}
}
//Initializing blurring kernel
for(int i = 0; i<Nk; i++) {
for(int j=0; j<Mk; j++) {
if((i-Nk/2)*(i-Nk/2) + (j-Mk/2)*(j-Mk/2) < r_kern*r_kern) {
// if(i*i + j*j < r_kern*r_kern) {
kernel[j*Nk+i] = 1.0;
} else {
kernel[j*Nk+i] = 0.0;
}
}
}
double *res = cuda_convolution_2D(data, kernel, N, M, Nk, Mk, Nk/2, Mk/2);
This is data (1024 X 1024):
http://postimg.org/image/dvs9yp99d/
Kernel (1024 X 1024):
http://postimg.org/image/afgl6k4xl/
And the result… s\
http://postimg.org/image/jm3h7kzjt/
Instead of assumed (obtained using FFTW):
http://postimg.org/image/y32rwk1l1/
Since I’m not a professional programmer, I assume that I don’t understand something important about the process of convolution with cuda r2c and c2r transforms and padding of kernel or data. So, I will be grateful for any explanations :) Thank you!