Problem with cufft

I encounter several problems when I use cufft. Below is my code:

-----------------------CODE BEGIN----------------------------------------------

#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#include <cufft.h>
#include <cutil.h>

//============================

// Allocate `size` bytes on the device and store the address in `pointer`.
// The cudaMalloc() call is passed through cutil's CUDA_SAFE_CALL.
// The trailing backslash is required: without it the macro body is empty and
// the do/while statement ends up at file scope.
#define MallocD(pointer, size) \
    do { CUDA_SAFE_CALL(cudaMalloc((void**)&(pointer), (size))); } while (0)

// Release device memory previously obtained through MallocD.
#define FreeD(pointer) \
    do { CUDA_SAFE_CALL(cudaFree(pointer)); } while (0)
//-----------------------------------------------------

// Copy `size` bytes from host buffer `h_src` to device buffer `d_dst`.
// The cudaMemcpy() call is passed through cutil's CUDA_SAFE_CALL.
// The trailing backslash keeps the do/while statement inside the macro body.
#define MemcpyH2D(d_dst, h_src, size) \
    do { CUDA_SAFE_CALL(cudaMemcpy((void*)(d_dst), (void*)(h_src), (size), \
                                   cudaMemcpyHostToDevice)); } while (0)

// Copy `size` bytes from device buffer `d_src` to host buffer `h_dst`.
#define MemcpyD2H(h_dst, d_src, size) \
    do { CUDA_SAFE_CALL(cudaMemcpy((void*)(h_dst), (void*)(d_src), (size), \
                                   cudaMemcpyDeviceToHost)); } while (0)

//============================

// Number of floats stored per batch row; gpufft_test() hands each row to
// CUFFT as N/2 complex values.
const uint32_t N      = 8192u;
// Number of rows, i.e. how many transforms are executed in one batched plan.
const uint32_t NBATCH = 256u;

//============================

void
gpufft_test()
{

CUT_DEVICE_INIT(1, 0);

float *d_idata;
float *h_idata;

MallocD(d_idata, NNBATCHsizeof(float));
h_idata=(float*)malloc(NNBATCHsizeof(float));

//===================================================

for(int k=0; k<NBATCH; ++k)
for(int i=0; i<N; ++i)
{
h_idata[k*N+i]=sqrtf((float)i)*0.0001f;
}

MemcpyH2D(d_idata, h_idata, NNBATCHsizeof(float));

//====================================================

cufftHandle plan;
cufftPlan1d(&plan, N/2, CUFFT_C2C, NBATCH);

//=====================================================

cufftExecC2C(plan, (cufftComplex*)d_idata,
(cufftComplex*)d_idata, CUFFT_INVERSE);

//=====================================================

MemcpyD2H(h_idata, d_idata, NNBATCHsizeof(float));

for(int k=0; k<NBATCH; ++k)
{
float h_k=h_idata+kN;

for(int i=0; i<N; ++i)
if(h_k[i]!=h_idata[i])
{
printf("#%d: h_idata(%d) = %f, h_k(%d)=%f\n",
k, i, h_idata[i],i, h_k[i]);
break;
}
}

FreeD(d_idata);
free(h_idata);

cufftDestroy(plan);

return;

}

//===============================

// Program entry point: runs the batched CUFFT consistency test once and
// reports success to the caller.
int
main()
{
    gpufft_test();
    return 0;
}

--------------------CODE END----------------------------

Environment Details:
Linux 2.6.18, x86_64 (64-bit).
toolkit v2.3
SDK v2.3
Card M1060

When NBATCH is less than 32, no error occurs even after many runs; when NBATCH is larger, e.g., 128, an error occurs after only a few runs!

Any suggestions are appreciated.

Thank you!