Warp Out-of-range Address

Hello,

I am using some code that I have copied directly from the SDK for doing convolution.

I sometimes get a "Warp Out-of-range Address" error in this kernel:

// Element-wise combine of two complex arrays, one element per thread.
//
// For every index i < dataSize the kernel loads d_Src[i] and d_Dst[i], passes
// them together with the scale factor c to mulAndScale(), and stores the
// result back into d_Dst[i]. d_Dst is therefore both an input and the output.
//
//   d_Dst    - in/out array, at least dataSize elements
//   d_Src    - input array, at least dataSize elements
//   dataSize - number of elements to process
//   c        - scale factor forwarded to mulAndScale()
//
// Expects a 1D launch with at least dataSize threads in total; surplus
// threads in the last block do nothing.
// NOTE(review): mulAndScale() is defined elsewhere; since `a` is what gets
// written back, it presumably updates `a` in place - confirm.
__global__ void modulateAndNormalize_kernel(
    fComplex *d_Dst,
    fComplex *d_Src,
    int dataSize,
    float c
){
    // Flat global thread index for a 1D grid of 1D blocks.
    const int i = threadIdx.x + blockIdx.x * blockDim.x;

    // Only threads that map onto a valid element touch memory.
    if (i < dataSize) {
        fComplex a = d_Src[i];
        fComplex b = d_Dst[i]; //the line showing the Error

        mulAndScale(a, b, c);

        d_Dst[i] = a;
    }
}

The code leading up to this kernel call is as follows:

chipH and chipW are both 1024. kSize is 3.

// Allocate the buffers: raw image (chipH x chipW floats), raw kernel
// (kSize x kSize floats), their padded versions (fftH x fftW floats), and the
// two spectra (fftH x (fftW / 2 + 1) cufftComplex values each).
// NOTE(review): fftH and fftW must already hold their final values here; the
// code that computes them is not part of this snippet - confirm they are not
// changed between these allocations and the calls below.
cutilSafeCall( cudaMalloc((void **)&d_Data,   chipH   * chipW   * sizeof(float)) );

        cutilSafeCall( cudaMalloc((void **)&d_Kernel, kSize * kSize * sizeof(float)) );

        cutilSafeCall( cudaMalloc((void **)&d_PaddedData,   fftH * fftW * sizeof(float)) );

        cutilSafeCall( cudaMalloc((void **)&d_PaddedKernel, fftH * fftW * sizeof(float)) );

        cutilSafeCall( cudaMalloc((void **)&d_DataSpectrum,   fftH * (fftW / 2 + 1) * sizeof(cufftComplex)) );

        cutilSafeCall( cudaMalloc((void **)&d_KernelSpectrum, fftH * (fftW / 2 + 1) * sizeof(cufftComplex)) );

	//PLAN
	// One forward (real-to-complex) and one inverse (complex-to-real) 2D plan,
	// both created for an fftH x fftW transform.

cufftSafeCall( cufftPlan2d(&fftPlanFwd, fftH, fftW, CUFFT_R2C) );

        cufftSafeCall( cufftPlan2d(&fftPlanInv, fftH, fftW, CUFFT_C2R) );

// Upload the convolution kernel and the image from host memory.
cutilSafeCall( cudaMemcpy(d_Kernel, hp_kernel, kSize * kSize * sizeof(float), cudaMemcpyHostToDevice) );

        cutilSafeCall( cudaMemcpy(d_Data,   float_h_data,   chipH   * chipW *   sizeof(float), cudaMemcpyHostToDevice) );

        // Zero both padded buffers before the padding helpers fill them.
        cutilSafeCall( cudaMemset(d_PaddedKernel, 0, fftH * fftW * sizeof(float)) );

        cutilSafeCall( cudaMemset(d_PaddedData,   0, fftH * fftW * sizeof(float)) );

// Copy image and kernel into their fftH x fftW padded buffers.
// NOTE(review): both helpers are defined elsewhere; any out-of-bounds write
// they perform would not be visible from this snippet - verify their indexing
// against the fftH x fftW allocations above.
padDataClampToBorder( d_PaddedData, d_Data, fftH, fftW, chipH, chipW, kSize, kSize, kernelY, kernelX);

        padKernel(d_PaddedKernel, d_Kernel, fftH, fftW, kSize, kSize, kernelY, kernelX);

// Forward-transform the padded kernel and padded data into their spectra.
cufftSafeCall( cufftExecR2C(fftPlanFwd, (cufftReal *)d_PaddedKernel, (cufftComplex *)d_KernelSpectrum) );

        cufftSafeCall( cufftExecR2C(fftPlanFwd, (cufftReal *)d_PaddedData, (cufftComplex *)d_DataSpectrum) );

	//multiplies the kernel and data spectrum, as well as normalizes the data
	// The result overwrites d_DataSpectrum. With padding = 1 the wrapper
	// processes fftH * (fftW / 2 + 1) elements, which equals the element count
	// allocated for each spectrum buffer above.

	modulateAndNormalize((fComplex *)d_DataSpectrum, (fComplex *)d_KernelSpectrum, fftH, fftW, 1);

and the modulateAndNormalize wrapper is defined as:

// Host wrapper that launches modulateAndNormalize_kernel over two complex
// arrays of fftH * (fftW / 2 + padding) elements each.
//
//   d_Dst   - device array, read and overwritten with the result
//   d_Src   - device array, read only by the kernel
//   fftH    - transform height
//   fftW    - transform width; must be even (asserted)
//   padding - extra elements per row beyond fftW / 2
//
// The kernel receives 1 / (fftW * fftH) as its scale factor. The launch uses
// 256 threads per block and enough blocks to cover every element.
// cutilCheckMsg() is called right after the launch with a failure message.
extern "C" void modulateAndNormalize(
    fComplex *d_Dst,
    fComplex *d_Src,
    int fftH,
    int fftW,
    int padding
){
    assert( fftW % 2 == 0 );

    const int threadsPerBlock = 256;

    // Number of complex elements in each array.
    const int dataSize = fftH * (fftW / 2 + padding);

    // Scale factor handed to the kernel.
    const float scale = 1.0f / (float)(fftW * fftH);

    modulateAndNormalize_kernel<<<iDivUp(dataSize, threadsPerBlock), threadsPerBlock>>>(
        d_Dst, d_Src, dataSize, scale
    );

    cutilCheckMsg("modulateAndNormalize() execution failed\n");
}

Any help tracking down how this warp error is occurring is much appreciated. The value of i at the line fComplex b = d_Dst[i]; is 90272 and dataSize is 1181184, so the index is well within range and it shouldn't be a problem.

Thanks