Hello,
I am using some code that I have copied directly from the SDK for doing convolution.
I am getting a Warp Out-of-range Address sometimes in this kernel:
// Per-element combine of two complex arrays.
//
// Launch layout: 1D grid of 1D blocks, one thread per element; the grid must
// cover at least dataSize threads. Threads past the end do nothing.
//
//   d_Dst    - read at index i, then overwritten with the combined value
//   d_Src    - read at index i
//   dataSize - number of fComplex elements processed in each array
//   c        - scalar forwarded unchanged to mulAndScale()
//
// mulAndScale() is defined elsewhere; the value left in its first argument
// after the call is what gets stored back into d_Dst.
__global__ void modulateAndNormalize_kernel(
    fComplex *d_Dst,
    fComplex *d_Src,
    int dataSize,
    float c
){
    const int idx = threadIdx.x + blockIdx.x * blockDim.x;

    // Only threads that map onto a real element touch memory.
    if(idx < dataSize){
        fComplex srcVal = d_Src[idx];
        fComplex dstVal = d_Dst[idx];

        mulAndScale(srcVal, dstVal, c);

        d_Dst[idx] = srcVal;
    }
}
The code leading up to this kernel call is as follows:
chipH and chipW are both 1024. kSize is 3.
// Device buffers for the unpadded inputs: chipH * chipW floats of data and
// kSize * kSize floats of filter kernel.
cutilSafeCall( cudaMalloc((void **)&d_Data, chipH * chipW * sizeof(float)) );
cutilSafeCall( cudaMalloc((void **)&d_Kernel, kSize * kSize * sizeof(float)) );
// Real-valued buffers of fftH * fftW floats that receive the padded data and
// the padded kernel.
// NOTE(review): fftH and fftW are computed outside this excerpt - confirm they
// already hold their final values before these allocations run.
cutilSafeCall( cudaMalloc((void **)&d_PaddedData, fftH * fftW * sizeof(float)) );
cutilSafeCall( cudaMalloc((void **)&d_PaddedKernel, fftH * fftW * sizeof(float)) );
// Spectrum buffers: fftH * (fftW / 2 + 1) complex elements each. This is the
// same element count modulateAndNormalize() computes when padding == 1.
cutilSafeCall( cudaMalloc((void **)&d_DataSpectrum, fftH * (fftW / 2 + 1) * sizeof(cufftComplex)) );
cutilSafeCall( cudaMalloc((void **)&d_KernelSpectrum, fftH * (fftW / 2 + 1) * sizeof(cufftComplex)) );
// PLAN: 2D cuFFT plans of size fftH x fftW, one real-to-complex (forward)
// and one complex-to-real (inverse).
cufftSafeCall( cufftPlan2d(&fftPlanFwd, fftH, fftW, CUFFT_R2C) );
cufftSafeCall( cufftPlan2d(&fftPlanInv, fftH, fftW, CUFFT_C2R) );
// Upload the filter kernel and the input data from host memory.
cutilSafeCall( cudaMemcpy(d_Kernel, hp_kernel, kSize * kSize * sizeof(float), cudaMemcpyHostToDevice) );
cutilSafeCall( cudaMemcpy(d_Data, float_h_data, chipH * chipW * sizeof(float), cudaMemcpyHostToDevice) );
// Zero both padded buffers before the padding routines write into them.
cutilSafeCall( cudaMemset(d_PaddedKernel, 0, fftH * fftW * sizeof(float)) );
cutilSafeCall( cudaMemset(d_PaddedData, 0, fftH * fftW * sizeof(float)) );
// Fill the padded buffers from the raw data and kernel.
// NOTE(review): padDataClampToBorder / padKernel and kernelY / kernelX are
// defined outside this excerpt; their return status is not checked here, so
// an out-of-range write inside them would only surface at a later call.
padDataClampToBorder( d_PaddedData, d_Data, fftH, fftW, chipH, chipW, kSize, kSize, kernelY, kernelX);
padKernel(d_PaddedKernel, d_Kernel, fftH, fftW, kSize, kSize, kernelY, kernelX);
// Forward transforms: padded kernel -> d_KernelSpectrum, padded data -> d_DataSpectrum.
cufftSafeCall( cufftExecR2C(fftPlanFwd, (cufftReal *)d_PaddedKernel, (cufftComplex *)d_KernelSpectrum) );
cufftSafeCall( cufftExecR2C(fftPlanFwd, (cufftReal *)d_PaddedData, (cufftComplex *)d_DataSpectrum) );
// Multiplies the kernel and data spectrum, as well as normalizes the data.
// d_DataSpectrum is passed as the destination, so it is overwritten with the
// result; padding = 1 makes the element count fftH * (fftW / 2 + 1).
modulateAndNormalize((fComplex *)d_DataSpectrum, (fComplex *)d_KernelSpectrum, fftH, fftW, 1);
and the modulateAndNormalize wrapper is defined as:
// Host-side launcher for modulateAndNormalize_kernel.
//
//   d_Dst   - device array, read and then overwritten by the kernel
//   d_Src   - device array, read by the kernel
//   fftH    - transform height
//   fftW    - transform width; must be even (asserted)
//   padding - extra elements per row on top of fftW / 2
//
// Both arrays must hold at least fftH * (fftW / 2 + padding) fComplex
// elements. The kernel is launched with 256-thread blocks, enough of them to
// cover every element, and receives 1 / (fftW * fftH) as its scale factor.
// Launch errors are reported through cutilCheckMsg().
extern "C" void modulateAndNormalize(
    fComplex *d_Dst,
    fComplex *d_Src,
    int fftH,
    int fftW,
    int padding
){
    assert( fftW % 2 == 0 );

    const int dataSize = fftH * (fftW / 2 + padding);

    // Form the product in floating point. Evaluated as int, fftW * fftH is
    // about twice dataSize and can overflow (yielding a negative or garbage
    // scale) even when dataSize itself still fits in an int. For sizes that
    // do not overflow the result is the same as before.
    const float scale = 1.0f / ((float)fftW * (float)fftH);

    modulateAndNormalize_kernel<<<iDivUp(dataSize, 256), 256>>>(
        d_Dst,
        d_Src,
        dataSize,
        scale
    );
    cutilCheckMsg("modulateAndNormalize() execution failed\n");
}
Any help tracking down how this warp error occurs would be much appreciated. The value of i at the line fComplex b = d_Dst[i]; is 90272, and dataSize is 1181184, so the index is well within range and it shouldn't be a problem?
Thanks