Below is the code sample in which I am getting a cudaDeviceSynchronize error:
// Forward declarations of the three kernels launched from subroutine1().
// Kernels must carry the __global__ qualifier (a bare `global` is not valid CUDA).
__global__ void kernel1(CELL *cell, AUXCELL *auxcell, FLOAT *gm1n, FLOAT *gm1b2n, INT *dev_cells);
__global__ void kernel2(RECINFO *recinfo, AUXCELL *auxcell, CELL *cell,
                        FLOAT *wtxn0, FLOAT *wtxn1, FLOAT *wtxn2, FLOAT *wtxn3, FLOAT *wtxn4,
                        FLOAT *wtyn0, FLOAT *wtyn1, FLOAT *wtyn2, FLOAT *wtyn3, FLOAT *wtyn4,
                        FLOAT *wtzn0, FLOAT *wtzn1, FLOAT *wtzn2, FLOAT *wtzn3, FLOAT *wtzn4,
                        INT *wtcelln0, INT *wtcelln1, INT *wtcelln2, INT *wtcelln3, INT *wtcelln4,
                        INT *dev_cells);
__global__ void kernel3(RECINFO *recinfo, INT *dev_cells);
/*
 * Host-side driver: copies the cell data to the GPU, then runs kernel1,
 * kernel2 and kernel3 in sequence, each as one block of `size` (= cells)
 * threads. Every launch is followed by cudaGetLastError() (launch errors)
 * and cudaDeviceSynchronize() (execution errors).
 *
 * Parts of the allocation / copy / host code are elided ("...") in this sample.
 *
 * NOTE(review): cudaStatus is not declared in this function; presumably it is
 * a cudaError_t declared at file scope -- confirm.
 * NOTE(review): in C++ a goto may not jump past a declaration with an
 * initializer; the elided declarations before the kernel2 launch must be
 * hoisted above the first `goto Error1` (or left uninitialized) -- confirm.
 */
void subroutine1(void)
{
    extern INT cells;
    extern AUXCELL *auxcell;            // structure variable
    extern CELL *cell;                  // structure variable
    extern RECINFO *recinfo;            // structure variable
    extern COMMON_GASC common_gasc;     // structure variable

    FLOAT *gm1n = &common_gasc.gm1, *gm1b2n = &common_gasc.gm1b2;
    size_t size = cells;
    CELL *dev_cell = 0;
    AUXCELL *dev_auxcell = 0;
    FLOAT *dev_gm1n = 0, *dev_gm1b2n = 0;
    INT *p_cells, *dev_cells = 0;
    p_cells = &cells;

    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error1;
    }

    cudaStatus = cudaMalloc((void**)&dev_cell, size * sizeof(CELL));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc(dev_cell) failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error1;
    }
    // ... cudaMalloc for all other device variables (elided)

    cudaStatus = cudaMemcpy(dev_cell, cell, size * sizeof(CELL), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy(dev_cell) failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error1;
    }
    // ... cudaMemcpy for all other device variables (elided)

    // size = 79
    kernel1<<<1, size>>>(dev_cell, dev_auxcell, dev_gm1n, dev_gm1b2n, dev_cells);
    // A bad launch configuration is reported here, not by the synchronize below.
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "kernel1 launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error1;
    }
    // The cudaDeviceSynchronize error is reported here on the second iteration
    // (second call to this subroutine in the code).
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching kernel1!\n", cudaStatus);
        goto Error1;
    }

    cudaStatus = cudaMemcpy(auxcell, dev_auxcell, size * sizeof(AUXCELL), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy(auxcell) failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error1;
    }

    // ... declaration of other host & device variables for kernel2 (elided)
    // The device is not set again here since it has been set earlier in the code.
    // cudaMalloc is not repeated for some of the device variables since it was
    // done earlier in the code, before the first kernel launch.
    kernel2<<<1, size>>>(dev_recinfo, dev_auxcell, dev_cell,
                         dev_wtxn0, dev_wtxn1, dev_wtxn2, dev_wtxn3, dev_wtxn4,
                         dev_wtyn0, dev_wtyn1, dev_wtyn2, dev_wtyn3, dev_wtyn4,
                         dev_wtzn0, dev_wtzn1, dev_wtzn2, dev_wtzn3, dev_wtzn4,
                         dev_wtcelln0, dev_wtcelln1, dev_wtcelln2, dev_wtcelln3, dev_wtcelln4,
                         dev_cells);
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "kernel2 launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error2;
    }
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching kernel2!\n", cudaStatus);
        goto Error2;
    }

    // ... host code (elided)

    // No cudaMalloc or cudaMemcpy is done for the third kernel launch; it uses
    // previously allocated device variables.
    kernel3<<<1, size>>>(dev_recinfo, dev_cells);
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "kernel3 launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error2;
    }
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching kernel3!\n", cudaStatus);
        goto Error2;
    }

    // The success path falls through into the cleanup labels below.
Error2:
    // ... cudaFree for device variables from kernel2 and kernel3 (elided)
Error1:
    // ... cudaFree for device variables from kernel1 (elided)

    // cudaDeviceReset has been done only once at the end of the code.
    // NOTE(review): as written, this call runs on every invocation of
    // subroutine1(). cudaDeviceReset() destroys all device allocations of the
    // process, so any device pointer allocated earlier and reused on a later
    // call would be dangling -- confirm this reset really executes only once,
    // at program exit.
    cudaStatus = cudaDeviceReset();
}
output.bmp (2.14 MB)