convolutionFFT2D using lena_bw.pgm works in EmuDebug, but not in Release

I tried to change the SDK example convolutionFFT2D to low pass filter lena_bw.pgm.
In EmuDebug, it prints ‘Test passed’ and the output image is ok (blurred). But in Debug or Release it still says ‘Test passed’ but I get a rubish output image. Do I need to perform some scaling of the image? or is there other issues? Here is my code (I removed the CPU processing):

/*

  • Copyright 1993-2007 NVIDIA Corporation. All rights reserved.
  • NOTICE TO USER:
  • This source code is subject to NVIDIA ownership rights under U.S. and
  • international Copyright laws. Users and possessors of this source code
  • are hereby granted a nonexclusive, royalty-free license to use this code
  • in individual and commercial software.
  • NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
  • CODE FOR ANY PURPOSE. IT IS PROVIDED “AS IS” WITHOUT EXPRESS OR
  • IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
  • REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
  • MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
  • IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
  • OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
  • OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
  • OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
  • OR PERFORMANCE OF THIS SOURCE CODE.
  • U.S. Government End Users. This source code is a “commercial item” as
  • that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
  • “commercial computer software” and "commercial computer software
  • documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
  • and is provided to the U.S. Government only as a commercial end item.
  • Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
  • 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
  • source code with only those rights set forth herein.
  • Any use of this source code in individual and commercial software must
  • include, in the user documentation and internal comments to the code,
  • the above Disclaimer and U.S. Government End Users Notice.
    */

/*

  • This sample demonstrates how 2D convolutions
  • with very large kernel sizes
  • can be efficiently implemented
  • using FFT transformations.
    */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cufft.h>
#include <cutil.h>

char *image_filename = “lena_small.pgm”;
unsigned int width, height;

////////////////////////////////////////////////////////////////////////////////
// Complex numbers manipulations
////////////////////////////////////////////////////////////////////////////////
typedef float2 Complex;

extern “C” host device Complex complexAdd(Complex a, Complex B){
Complex t = {a.x + b.x, a.y + b.y};
return t;
}

extern “C” host device Complex complexMul(Complex a, Complex B){
Complex t = {a.x * b.x - a.y * b.y, a.x * b.y + a.y * b.x};
return t;
}

extern “C” host device Complex complexScale(Complex a, float c){
Complex t = {a.x * c, a.y * c};
return t;
}

////////////////////////////////////////////////////////////////////////////////
// Common host and device functions
////////////////////////////////////////////////////////////////////////////////
//Round a / b to nearest higher integer value
int iDivUp(int a, int B){
return (a % b != 0) ? (a / b + 1) : (a / B);
}

//Align a to nearest higher multiple of b
int iAlignUp(int a, int B){
return (a % b != 0) ? (a - a % b + B) : a;
}

////////////////////////////////////////////////////////////////////////////////
// Padding kernels
////////////////////////////////////////////////////////////////////////////////
#include “convolutionFFT2D_kernel.cu”

////////////////////////////////////////////////////////////////////////////////
// Data configuration
////////////////////////////////////////////////////////////////////////////////
int calculateFFTsize(int dataSize){
//Highest non-zero bit position of dataSize
int hiBit;
//Neares lower and higher powers of two numbers for dataSize
unsigned int lowPOT, hiPOT;

//Align data size to a multiple of half-warp
//in order to have each line starting at properly aligned addresses
//for coalesced global memory writes in padKernel() and padData()
dataSize = iAlignUp(dataSize, 16);

//Find highest non-zero bit
for(hiBit = 31; hiBit >= 0; hiBit--)
    if(dataSize & (1U << hiBit)) break;

//No need to align, if already power of two
lowPOT = 1U << hiBit;
if(lowPOT == dataSize) return dataSize;

//Align to a nearest higher power of two, if the size is small enough,
//else align only to a nearest higher multiple of 512,
//in order to save computation and memory bandwidth
hiPOT = 1U << (hiBit + 1);
if(hiPOT <= 1024)
    return hiPOT;
else 
    return iAlignUp(dataSize, 512);

}

//Kernel dimensions
const int KERNEL_W = 7;
const int KERNEL_H = 7;

//Kernel center position
const int KERNEL_X = 1;
const int KERNEL_Y = 6;

//Width and height of padding for “clamp to border” addressing mode
const int PADDING_W = KERNEL_W - 1;
const int PADDING_H = KERNEL_H - 1;

int DATA_W;
int DATA_H;

//Derive FFT size from data and kernel dimensions
int FFT_W;
int FFT_H;

int FFT_SIZE;
int KERNEL_SIZE;
int DATA_SIZE;

float* h_img = NULL;
float* h_odata = NULL;

Complex *h_Kernel,
*h_Data,
*h_ResultCPU,
*h_ResultGPU;

cudaArray
*a_Kernel,
*a_Data;
Complex
*d_PaddedKernel,
*d_PaddedData;

cufftHandle FFTplan;

void init_cuda()
{
//Derive FFT size from data and kernel dimensions
FFT_W = calculateFFTsize(DATA_W + PADDING_W);
FFT_H = calculateFFTsize(DATA_H + PADDING_H);

FFT_SIZE = FFT_W    * FFT_H    * sizeof(Complex);
KERNEL_SIZE = KERNEL_W * KERNEL_H * sizeof(Complex);
DATA_SIZE = DATA_W   * DATA_H   * sizeof(Complex);

printf("Input data size           : %i x %i\n", DATA_W,             DATA_H            );
printf("Convolution kernel size   : %i x %i\n", KERNEL_W,           KERNEL_H          );
printf("Padded image size         : %i x %i\n", DATA_W + PADDING_W, DATA_H + PADDING_H);
printf("Aligned padded image size : %i x %i\n", FFT_W,              FFT_H             );

printf("Allocating memory...\n");
h_Kernel       = (Complex *)malloc(KERNEL_SIZE);
h_Data         = (Complex *)malloc(DATA_SIZE);
h_ResultCPU    = (Complex *)malloc(DATA_SIZE);
h_ResultGPU    = (Complex *)malloc(FFT_SIZE);

cudaChannelFormatDesc float2tex = cudaCreateChannelDesc<float2>();

CUDA_SAFE_CALL( cudaMallocArray(&a_Kernel, &float2tex, KERNEL_W, KERNEL_H) );
CUDA_SAFE_CALL( cudaMallocArray(&a_Data,   &float2tex,   DATA_W,   DATA_H) );
CUDA_SAFE_CALL( cudaMalloc((void **)&d_PaddedKernel, FFT_SIZE) );
CUDA_SAFE_CALL( cudaMalloc((void **)&d_PaddedData,   FFT_SIZE) );

}

void cleanup()
{
printf(“Shutting down…\n”);
CUDA_SAFE_CALL( cudaUnbindTexture(texData) );
CUDA_SAFE_CALL( cudaUnbindTexture(texKernel) );
CUFFT_SAFE_CALL( cufftDestroy(FFTplan) );
CUDA_SAFE_CALL( cudaFree(d_PaddedData) );
CUDA_SAFE_CALL( cudaFree(d_PaddedKernel) );
CUDA_SAFE_CALL( cudaFreeArray(a_Data) );
CUDA_SAFE_CALL( cudaFreeArray(a_Kernel) );
free(h_ResultGPU);
free(h_Data);
free(h_Kernel);
free(h_img); // error?
free(h_odata);
}

////////////////////////////////////////////////////////////////////////////////
// Main program
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
CUT_DEVICE_INIT();

char* image_path = "C:/Program Files/NVIDIA Corporation/NVIDIA CUDA SDK/projects/myConvolutionFFT2D/data/lena_200.pgm";
	//cutFindFilePath(image_filename, argv[0]);
if (image_path == 0) {
    fprintf(stderr, "Error finding image file '%s'\n", image_filename);
    exit(EXIT_FAILURE);
}

CUT_SAFE_CALL( cutLoadPGMf(image_path, &h_img, &width, &height));
if (!h_img) {
    printf("Error opening file '%s'\n", image_path);
    exit(-1);
}
printf("Loaded '%s', %d x %d pixels\n", image_path, width, height);

DATA_W = width;
DATA_H = height;

init_cuda();

printf("Generating random input data...\n");
    srand((int)time(NULL));
    for(int i = 0; i < (KERNEL_W * KERNEL_H); i++){
        h_Kernel[i].x = 1.0 / (KERNEL_W * KERNEL_H); //(float)rand() / (float)RAND_MAX;
        h_Kernel[i].y = 0;
    }

    for(int i = 0; i < (DATA_W * DATA_H); i++){
        h_Data[i].x = h_img[i];//(float)rand() / (float)RAND_MAX;
        h_Data[i].y = 0;
    }

printf("Creating FFT plan for %i x %i...\n", FFT_W, FFT_H);
    CUFFT_SAFE_CALL( cufftPlan2d(&FFTplan, FFT_H, FFT_W, CUFFT_C2C) );

printf("Uploading to GPU and padding convolution kernel and input data...\n");
    printf("...initializing padded kernel and data storage with zeroes...\n");
    CUDA_SAFE_CALL( cudaMemset(d_PaddedKernel, 0, FFT_SIZE) );
    CUDA_SAFE_CALL( cudaMemset(d_PaddedData,   0, FFT_SIZE) );
    printf("...copying input data and convolution kernel from host to CUDA arrays\n");
    CUDA_SAFE_CALL( cudaMemcpyToArray(a_Kernel, 0, 0, h_Kernel, KERNEL_SIZE, cudaMemcpyHostToDevice) );
    CUDA_SAFE_CALL( cudaMemcpyToArray(a_Data,   0, 0, h_Data,   DATA_SIZE,   cudaMemcpyHostToDevice) );
    printf("...binding CUDA arrays to texture references\n");
    CUDA_SAFE_CALL( cudaBindTextureToArray(texKernel, a_Kernel) );
    CUDA_SAFE_CALL( cudaBindTextureToArray(texData,   a_Data)   );

    //Block width should be a multiple of maximum coalesced write size 
    //for coalesced memory writes in padKernel() and padData()
    dim3 threadBlock(16, 12);
    dim3 kernelBlockGrid(iDivUp(KERNEL_W, threadBlock.x), iDivUp(KERNEL_H, threadBlock.y));
    dim3 dataBlockGrid(iDivUp(FFT_W, threadBlock.x), iDivUp(FFT_H, threadBlock.y));

    printf("...padding convolution kernel\n");
    padKernel<<<kernelBlockGrid, threadBlock>>>(
        d_PaddedKernel,
        FFT_W,
        FFT_H,
        KERNEL_W,
        KERNEL_H,
        KERNEL_X,
        KERNEL_Y
    );
    CUT_CHECK_ERROR("padKernel() execution failed\n");

    printf("...padding input data array\n");
    padData<<<dataBlockGrid, threadBlock>>>(
        d_PaddedData,
        FFT_W,
        FFT_H,
        DATA_W,
        DATA_H,
        KERNEL_W,
        KERNEL_H,
        KERNEL_X,
        KERNEL_Y
    );
    CUT_CHECK_ERROR("padData() execution failed\n");

//Not including kernel transformation into time measurement,
//since convolution kernel is not changed very frequently
printf("Transforming convolution kernel...\n");
    CUFFT_SAFE_CALL( cufftExecC2C(FFTplan, (cufftComplex *)d_PaddedKernel, (cufftComplex *)d_PaddedKernel, CUFFT_FORWARD) );

printf("Running GPU FFT convolution...\n");
CUDA_SAFE_CALL( cudaThreadSynchronize() );
CUFFT_SAFE_CALL( cufftExecC2C(FFTplan, (cufftComplex *)d_PaddedData,   (cufftComplex *)d_PaddedData,   CUFFT_FORWARD) );
    
modulateAndNormalize<<<16, 128>>>( d_PaddedData, d_PaddedKernel, FFT_W * FFT_H );

CUT_CHECK_ERROR("modulateAndNormalize() execution failed\n");
CUFFT_SAFE_CALL( cufftExecC2C(FFTplan, (cufftComplex *)d_PaddedData,   (cufftComplex *)d_PaddedData,   CUFFT_INVERSE) );
CUDA_SAFE_CALL( cudaThreadSynchronize() );

printf("Reading back GPU FFT results...\n");
CUDA_SAFE_CALL( cudaMemcpy(h_ResultGPU, d_PaddedData, FFT_SIZE, cudaMemcpyDeviceToHost) );

printf("allocating host data...\n");
h_odata = (float*) malloc(DATA_H*DATA_W*sizeof(float));

printf("copying host data...\n");
for(int j=0; j < DATA_H; j++)
  {
  for(int i = 0; i < DATA_W; i++)
    {
    h_odata[j*DATA_W+i] = h_ResultGPU[j*FFT_W+i].x;
    }
}

printf("host data copied...\n");

// write result to file
//char output_filename[1024];
//strcpy(output_filename, image_path);
//strcpy(output_filename + strlen(image_path) - 4, "_out.pgm");
char *output_filename = "C:/Program Files/NVIDIA Corporation/NVIDIA CUDA SDK/projects/myConvolutionFFT2D/data/lena_200_out.pgm";
CUT_SAFE_CALL( cutSavePGMf( output_filename, h_odata, width, height));
printf("Wrote '%s'\n", output_filename);

cleanup();

CUT_EXIT(argc, argv);

}

I noticed that:
CUDA_SAFE_CALL( cudaMemset(d_PaddedKernel, 0, FFT_SIZE) );
creates an image filled with zero in Emulation mode.
But in Release, the last row has rubbish values.
d_PaddedData is correctly filled with zero either in Emulation or Release.

Here’s the code to write the image:
Complex h_PaddedKernel = (Complex)malloc(FFT_SIZE);
float h_PaddedKernel_float = (float)malloc(FFT_SIZE/2);

CUDA_SAFE_CALL( cudaMemcpy(h_PaddedKernel, d_PaddedKernel, FFT_SIZE, cudaMemcpyDeviceToHost ) );

for(int i = 0; i < (FFT_W * FFT_H); i++)
{
    h_PaddedKernel_float[i] = h_PaddedKernel[i].x;
    }

char *kernel_filename = "C:/Program Files/NVIDIA Corporation/NVIDIA CUDA SDK/projects/myConvolutionFFT2D/data/padded_kernel.pgm";

CUT_SAFE_CALL( cutSavePGMf( kernel_filename, h_PaddedKernel_float, FFT_W, FFT_H));

free(h_PaddedKernel);
free(h_PaddedKernel_float);

Does it look correct after “enforcing” the Memset in one or another way (f.e. when clearing the buffer on host and copying it to GPU)? How big is the reported L2-norm (Emu and Release)? What CUDA toolkit / display driver versions are used?

I actually hadn’t tested the results visually, just always validate by L2-norm, which lies around 1e-7…2e-7 – an expected difference, especially given that CPU and GPU follow different computation paths.