First chance Exceptipn at cuda memory

I’m beginner of Cuda coding. I am being stacked by cudaError_enum at memory location xxxxx;

I deveoped a global cuda functionin, and try to called it in a loop (0=< k <= N), and transfter data out from GPU to CPU.

this function works when k<15. and I can transfter data out from GPU to CPU using cudaMemcpy. When K goes to 15, the cuda function still works. While, cudaMemcpy function does not work, it gives out the error:

“First-chance exception at 0x000007fefd34aa7d in cudaCubicRotate2D.exe: Microsoft C++ exception: cudaError_enum at memory location 0x0012f9a0.”

Could anyone please help me ? I has been stopped by this for couple months. Any help will be highly appreicated.

[codebox]

main code.

const dim3 blockSize(16, 16);

const dim3 gridSize(imageSize.x / blockSize.x, imageSize.y / blockSize.y);

for(k = 0; k < 100; k++){

  warp_kernel<<<gridSize, blockSize>>>(output, voxel, coordX, coordY, coordZ, imageSize, k, threshold);

  cudaMemcpy(OutImage, output, nrOfBytes, cudaMemcpyDeviceToHost);

  for(j = 0; j < imageSize.y; j++){

         for(i = 0; i < imageSize.x; i++){

   OutputImage[k * slice + j * imageSize.x +i] = OutImage[j * imageSize.x + i];

          }

  }

}

function

///// Warp the refernece image into different phase ///////////

global void

warp_kernel(float* output, float* voxel, float* coordX, float* coordY, float* coordZ, uint3 imageSize, uint k, int threshold)

{

//long k = 0;

uint i = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;

uint j = __umul24(blockIdx.y, blockDim.y) + threadIdx.y;

uint tt = __umul24(j, imageSize.x) + i;

long  nx, ny, nz;

float temp, temp1, temp2, temp3, temp0;

float *p0, *px, *py, *pz, *px0, *py0, *pz0;

long  ix, iy, iz, ixL, ixU, iyL, iyU, izL, izU;   

long  slice = imageSize.x * imageSize.y;

///////////////  warp the first slice  ////////////////////

ixL = max(0, i - threshold);  

ixU = min (imageSize.x - 1, i + threshold);		

iyL = max(0, j - threshold); 

iyU = min (imageSize.y - 1, j + threshold);

izL = max(0, k - threshold);

izU = min (imageSize.z - 1, k + threshold);

ix  = (long)i; 

iy  = (long)j;

iz  = (long)k;

temp1 = coordX[iz * slice + iy * imageSize.x + ix];

temp2 = coordY[iz * slice + iy * imageSize.x + ix];

temp3 = coordZ[iz * slice + iy * imageSize.x + ix]; 

temp = (temp1 - (float)i) * (temp1 - (float)i) + (temp2 - (float)j) * (temp2 - (float)j) + (temp3 - (float)k) * (temp3 - (float)k);		



for (nz = izL; nz <= izU; nz++){	

      px = coordX + (ptrdiff_t)(nz * slice);

      py = coordY + (ptrdiff_t)(nz * slice);

      pz = coordZ + (ptrdiff_t)(nz * slice);

      for (ny = iyL; ny <= iyU; ny++){	

px0 = px + (ptrdiff_t)(ny * imageSize.x);

py0 = py + (ptrdiff_t)(ny * imageSize.x);

pz0 = pz + (ptrdiff_t)(ny * imageSize.x);	  

for (nx = ixL; nx <= ixU; nx++){

    float temp11 = px0[nx];

    float temp22 = py0[nx];

    float temp33 = pz0[nx]; 

    temp0 = (temp11 - i) * (temp11 - i) + (temp22 - j) * (temp22 - j) + (temp33 - k) * (temp33 - k);		

    if( temp0 < temp ){

         ix    = nx;

         iy    = ny;

         iz    = nz;

         temp  = temp0;

    }					

}

      }

 }

 p0 = voxel + (ptrdiff_t)(iz * slice + iy * imageSize.x + ix);	    

 output[tt] = p0[0];		

}

';[/codebox]