cudaMemset bug?

I noticed that in Release mode:
CUDA_SAFE_CALL( cudaMemset(d_PaddedKernel, 0, FFT_SIZE) );
should fill d_PaddedKernel with zeros, but when I write the buffer out as an image, the values in the last row are random. In Emulation mode, the output image is correctly filled with zeros. Any hints?

d_PaddedKernel is defined like this:
typedef float2 Complex;
Complex *d_PaddedKernel;
CUDA_SAFE_CALL( cudaMalloc((void **)&d_PaddedKernel, FFT_SIZE) );

FFT_SIZE is the allocated size in bytes:
FFT_SIZE = FFT_W * FFT_H * sizeof(Complex);

Here’s the code to write the image:
Complex *h_PaddedKernel = (Complex *)malloc(FFT_SIZE);
float *h_PaddedKernel_float = (float *)malloc(FFT_SIZE/2);

CUDA_SAFE_CALL( cudaMemcpy(h_PaddedKernel, d_PaddedKernel, FFT_SIZE, cudaMemcpyDeviceToHost ) );

for(int i = 0; i < (FFT_W * FFT_H); i++)
h_PaddedKernel_float[i] = h_PaddedKernel[i].x;

char *kernel_filename = "C:/padded_kernel.pgm";

CUT_SAFE_CALL( cutSavePGMf( kernel_filename, h_PaddedKernel_float, FFT_W, FFT_H));