cudaDeviceSynchronize error

I am calling cudaDeviceSynchronize immediately after each kernel launch. In the first iteration, all instructions execute and the call succeeds.
But in the second iteration, cudaDeviceSynchronize returns an error with error code 4.

Then I added __syncthreads() inside the kernel code, and cudaDeviceSynchronize failed again, this time with error code 30.

But in Nsight debugging, the execution is successful every time. Can anybody please tell me what could be the reason for this?

Note: I have Tesla C2075 and Quadro 4000 GPU cards.

You might need to post sample code that reproduces the issue for anyone to comment.

Below is the code sample in which I am getting the cudaDeviceSynchronize error:

// Forward declarations of the three kernels launched from subroutine1().
// A kernel must carry the __global__ qualifier to be launchable with the
// <<<grid, block>>> syntax; every pointer parameter is passed straight from
// the dev_* variables at the launch sites.
__global__ void kernel1(CELL *cell, AUXCELL *auxcell, FLOAT *gm1n, FLOAT *gm1b2n, INT *dev_cells);
__global__ void kernel2(RECINFO *recinfo, AUXCELL *auxcell, CELL *cell,
                        FLOAT *wtxn0, FLOAT *wtxn1, FLOAT *wtxn2, FLOAT *wtxn3, FLOAT *wtxn4,
                        FLOAT *wtyn0, FLOAT *wtyn1, FLOAT *wtyn2, FLOAT *wtyn3, FLOAT *wtyn4,
                        FLOAT *wtzn0, FLOAT *wtzn1, FLOAT *wtzn2, FLOAT *wtzn3, FLOAT *wtzn4,
                        INT *wtcelln0, INT *wtcelln1, INT *wtcelln2, INT *wtcelln3, INT *wtcelln4,
                        INT *dev_cells);
__global__ void kernel3(RECINFO *recinfo, INT *dev_cells);

/*
 * subroutine1 - uploads the host `cell` array to the device, launches
 * kernel1, kernel2 and kernel3 in that order (each as a single block of
 * `cells` threads), copies the `auxcell` results back to the host after
 * kernel1, then falls through to the cleanup labels.
 *
 * Every launch is followed by cudaGetLastError() (reports an invalid launch
 * configuration immediately) and by cudaDeviceSynchronize() (reports faults
 * raised while the kernel was executing). Failures are printed with
 * cudaGetErrorString() so the message does not depend on the numeric value
 * of the error code.
 *
 * Lines consisting of "…" were elided in the original sample and are kept
 * as-is. `cudaStatus` is not declared in the visible code; presumably it is
 * a cudaError_t declared elsewhere - TODO confirm.
 *
 * NOTE(review): Error2/Error1 are reached on the success path as well (there
 * is no return before them), so each call frees the device buffers listed
 * there and, if the cudaDeviceReset() below really sits inside this function,
 * resets the device. Buffers described as "allocated earlier in the code" and
 * not re-allocated here would then be invalid on the second call, which would
 * match the reported second-iteration failure - TODO confirm against the
 * full source.
 */
void subroutine1(void)
{
    extern INT cells;
    extern AUXCELL *auxcell;        // structure variable
    extern CELL *cell;              // structure variable
    extern RECINFO *recinfo;        // structure variable
    extern COMMON_GASC common_gasc; // structure variable
    FLOAT *gm1n = &common_gasc.gm1, *gm1b2n = &common_gasc.gm1b2;
    size_t size = cells;
    CELL *dev_cell = 0;
    AUXCELL *dev_auxcell = 0;
    FLOAT *dev_gm1n = 0, *dev_gm1b2n = 0;
    INT *p_cells, *dev_cells = 0;
    p_cells = &cells;

    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed with error code %d (%s)\n",
                (int)cudaStatus, cudaGetErrorString(cudaStatus));
        goto Error1;
    }

    cudaStatus = cudaMalloc((void**)&dev_cell, size * sizeof(CELL));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc for dev_cell failed with error code %d (%s)\n",
                (int)cudaStatus, cudaGetErrorString(cudaStatus));
        goto Error1;
    }
    … // cudaMalloc for all other device variables

    cudaStatus = cudaMemcpy(dev_cell, cell, size * sizeof(CELL), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy to dev_cell failed with error code %d (%s)\n",
                (int)cudaStatus, cudaGetErrorString(cudaStatus));
        goto Error1;
    }
    … // cudaMemcpy for all other device variables

    // One block of `size` threads; `size` must not exceed the device's
    // maximum threads per block, otherwise the launch itself is rejected.
    kernel1<<<1, size>>>(dev_cell, dev_auxcell, dev_gm1n, dev_gm1b2n, dev_cells);
    // size = 79
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "kernel1 launch failed with error code %d (%s)\n",
                (int)cudaStatus, cudaGetErrorString(cudaStatus));
        goto Error1;
    }
    // here I am getting cudaDeviceSynchronize error for second iteration (second call to this subroutine in the code)
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d (%s) after launching kernel1!\n",
                (int)cudaStatus, cudaGetErrorString(cudaStatus));
        goto Error1;
    }

    cudaStatus = cudaMemcpy(auxcell, dev_auxcell, size * sizeof(AUXCELL), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy from dev_auxcell failed with error code %d (%s)\n",
                (int)cudaStatus, cudaGetErrorString(cudaStatus));
        goto Error1;
    }

    //declaration of other host & device variables for kernel2
    // here I have not set the device since it has been set earlier in the code.
    // here cudaMalloc has not been done for some of the device variables since it has been done earlier in the code before first kernel launch.
    kernel2<<<1, size>>>
        (dev_recinfo, dev_auxcell, dev_cell,
         dev_wtxn0, dev_wtxn1, dev_wtxn2, dev_wtxn3, dev_wtxn4,
         dev_wtyn0, dev_wtyn1, dev_wtyn2, dev_wtyn3, dev_wtyn4,
         dev_wtzn0, dev_wtzn1, dev_wtzn2, dev_wtzn3, dev_wtzn4,
         dev_wtcelln0, dev_wtcelln1, dev_wtcelln2, dev_wtcelln3, dev_wtcelln4,
         dev_cells);
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "kernel2 launch failed with error code %d (%s)\n",
                (int)cudaStatus, cudaGetErrorString(cudaStatus));
        goto Error2;
    }
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d (%s) after launching kernel2!\n",
                (int)cudaStatus, cudaGetErrorString(cudaStatus));
        goto Error2;
    }
    … //host code
    //here no cudaMalloc and cudaMemcpy has been done for third kernel launch, used
    //previously allocated device variables.

    kernel3<<<1, size>>>(dev_recinfo, dev_cells);
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "kernel3 launch failed with error code %d (%s)\n",
                (int)cudaStatus, cudaGetErrorString(cudaStatus));
        goto Error2;
    }
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d (%s) after launching kernel3!\n",
                (int)cudaStatus, cudaGetErrorString(cudaStatus));
        goto Error2;
    }

    // Cleanup: the success path falls through both labels as well.
Error2: … // cudaFree for device variables from kernel2 and kernel3.
Error1: … // cudaFree for device variables from kernel1
    // cudaDeviceReset has been done only once at the end of the code.
    cudaStatus = cudaDeviceReset();

}

output.bmp (2.14 MB)