cudaLaunch time increases with the number of kernel launches

I wrote a CUDA program that needs to call several kernels in a loop.

I found that as the number of loop iterations grows, the cudaLaunch time reported by nvprof becomes much longer.

Here are some examples.
With 13 iterations, the cudaLaunch line is:
Time(%)      Time     Calls       Avg       Min       Max  Name
  0.06%  4.9853ms       937  5.3200us  4.4040us  40.690us  cudaLaunch

With 63 iterations, the cudaLaunch line is:
 76.07%  30.3121s      4537  6.6811ms  6.5130us  15.109ms  cudaLaunch

And with 125 iterations:
 87.75%  68.7761s      9001  7.6409ms  4.4660us  15.988ms  cudaLaunch

Since many more iterations may be needed, cudaLaunch becomes the most time-consuming call.

I’d like to know why this happens and whether there are solutions.

Thank you!

I wouldn’t know how to diagnose this without access to the code and some additional data. Maybe someone else can make a guess. You can help them by telling us:

(1) CUDA version
(2) GPU used
(3) operating system

If the entire buildable code is not too big, I would suggest posting it.

Thank you!
This program is a finite-difference simulation of seismic waves.

(1) CUDA version: CUDA 8.0
(2) GPU used: Titan Xp
(3) Operating system: CentOS 7

The code for the loop:

for (int it = 0 ; it < timesteps ; it=it+8)
    {
        printf("\tt = %d ", it);

        // launch the kernel
        printf("launch kernel\n");
		//RK step1
		RK_init<<<dimGrid,dimBlock>>>(buW,bumW,butW,dimx,dimy,dimz);
		LxB_LyB_LzB<<<dimGrid,dimBlock>>>(buW,buhW,buM,buMd,dimx,dimy,dimz);
		RK_begin<<<dimGrid,dimBlock>>>(buW,bumW,buhW,butW,dt,RK4a[1],RK4b[0],dimx,dimy,dimz);
		LxF_LyF_LzF<<<dimGrid,dimBlock>>>(buW,buhW,buM,buMd,dimx,dimy,dimz);
		RK_inner<<<dimGrid,dimBlock>>>(buW,bumW,buhW,butW,dt,RK4a[2],RK4b[1],dimx,dimy,dimz);
		LxB_LyB_LzB<<<dimGrid,dimBlock>>>(buW,buhW,buM,buMd,dimx,dimy,dimz);
		RK_inner<<<dimGrid,dimBlock>>>(buW,bumW,buhW,butW,dt,RK4a[3],RK4b[2],dimx,dimy,dimz);
		LxF_LyF_LzF<<<dimGrid,dimBlock>>>(buW,buhW,buM,buMd,dimx,dimy,dimz);
		RK_finish<<<dimGrid,dimBlock>>>(buW,buhW,butW,dt,RK4b[3],dimx,dimy,dimz);
		// step2
		RK_init<<<dimGrid,dimBlock>>>(buW,bumW,butW,dimx,dimy,dimz);
		LxF_LyF_LzB<<<dimGrid,dimBlock>>>(buW,buhW,buM,buMd,dimx,dimy,dimz);
		RK_begin<<<dimGrid,dimBlock>>>(buW,bumW,buhW,butW,dt,RK4a[1],RK4b[0],dimx,dimy,dimz);
		LxB_LyB_LzF<<<dimGrid,dimBlock>>>(buW,buhW,buM,buMd,dimx,dimy,dimz);
		RK_inner<<<dimGrid,dimBlock>>>(buW,bumW,buhW,butW,dt,RK4a[2],RK4b[1],dimx,dimy,dimz);
		LxF_LyF_LzB<<<dimGrid,dimBlock>>>(buW,buhW,buM,buMd,dimx,dimy,dimz);
		RK_inner<<<dimGrid,dimBlock>>>(buW,bumW,buhW,butW,dt,RK4a[3],RK4b[2],dimx,dimy,dimz);
		LxB_LyB_LzF<<<dimGrid,dimBlock>>>(buW,buhW,buM,buMd,dimx,dimy,dimz);
		RK_finish<<<dimGrid,dimBlock>>>(buW,buhW,butW,dt,RK4b[3],dimx,dimy,dimz);
		// step3
		RK_init<<<dimGrid,dimBlock>>>(buW,bumW,butW,dimx,dimy,dimz);
		LxF_LyF_LzF<<<dimGrid,dimBlock>>>(buW,buhW,buM,buMd,dimx,dimy,dimz);
		RK_begin<<<dimGrid,dimBlock>>>(buW,bumW,buhW,butW,dt,RK4a[1],RK4b[0],dimx,dimy,dimz);
		LxB_LyB_LzB<<<dimGrid,dimBlock>>>(buW,buhW,buM,buMd,dimx,dimy,dimz);
		RK_inner<<<dimGrid,dimBlock>>>(buW,bumW,buhW,butW,dt,RK4a[2],RK4b[1],dimx,dimy,dimz);
		LxF_LyF_LzF<<<dimGrid,dimBlock>>>(buW,buhW,buM,buMd,dimx,dimy,dimz);
		RK_inner<<<dimGrid,dimBlock>>>(buW,bumW,buhW,butW,dt,RK4a[3],RK4b[2],dimx,dimy,dimz);
		LxB_LyB_LzB<<<dimGrid,dimBlock>>>(buW,buhW,buM,buMd,dimx,dimy,dimz);
		RK_finish<<<dimGrid,dimBlock>>>(buW,buhW,butW,dt,RK4b[3],dimx,dimy,dimz);
		// step 4
		RK_init<<<dimGrid,dimBlock>>>(buW,bumW,butW,dimx,dimy,dimz);
		LxB_LyB_LzF<<<dimGrid,dimBlock>>>(buW,buhW,buM,buMd,dimx,dimy,dimz);
		RK_begin<<<dimGrid,dimBlock>>>(buW,bumW,buhW,butW,dt,RK4a[1],RK4b[0],dimx,dimy,dimz);
		LxF_LyF_LzB<<<dimGrid,dimBlock>>>(buW,buhW,buM,buMd,dimx,dimy,dimz);
		RK_inner<<<dimGrid,dimBlock>>>(buW,bumW,buhW,butW,dt,RK4a[2],RK4b[1],dimx,dimy,dimz);
		LxB_LyB_LzF<<<dimGrid,dimBlock>>>(buW,buhW,buM,buMd,dimx,dimy,dimz);
		RK_inner<<<dimGrid,dimBlock>>>(buW,bumW,buhW,butW,dt,RK4a[3],RK4b[2],dimx,dimy,dimz);
		LxF_LyF_LzB<<<dimGrid,dimBlock>>>(buW,buhW,buM,buMd,dimx,dimy,dimz);
		RK_finish<<<dimGrid,dimBlock>>>(buW,buhW,butW,dt,RK4b[3],dimx,dimy,dimz);
		// step 5
		RK_init<<<dimGrid,dimBlock>>>(buW,bumW,butW,dimx,dimy,dimz);
		LxB_LyF_LzB<<<dimGrid,dimBlock>>>(buW,buhW,buM,buMd,dimx,dimy,dimz);
		RK_begin<<<dimGrid,dimBlock>>>(buW,bumW,buhW,butW,dt,RK4a[1],RK4b[0],dimx,dimy,dimz);
		LxF_LyB_LzF<<<dimGrid,dimBlock>>>(buW,buhW,buM,buMd,dimx,dimy,dimz);
		RK_inner<<<dimGrid,dimBlock>>>(buW,bumW,buhW,butW,dt,RK4a[2],RK4b[1],dimx,dimy,dimz);
		LxB_LyF_LzB<<<dimGrid,dimBlock>>>(buW,buhW,buM,buMd,dimx,dimy,dimz);
		RK_inner<<<dimGrid,dimBlock>>>(buW,bumW,buhW,butW,dt,RK4a[3],RK4b[2],dimx,dimy,dimz);
		LxF_LyB_LzF<<<dimGrid,dimBlock>>>(buW,buhW,buM,buMd,dimx,dimy,dimz);
		RK_finish<<<dimGrid,dimBlock>>>(buW,buhW,butW,dt,RK4b[3],dimx,dimy,dimz);
		//step 6
		RK_init<<<dimGrid,dimBlock>>>(buW,bumW,butW,dimx,dimy,dimz);
		LxF_LyB_LzB<<<dimGrid,dimBlock>>>(buW,buhW,buM,buMd,dimx,dimy,dimz);
		RK_begin<<<dimGrid,dimBlock>>>(buW,bumW,buhW,butW,dt,RK4a[1],RK4b[0],dimx,dimy,dimz);
		LxB_LyF_LzF<<<dimGrid,dimBlock>>>(buW,buhW,buM,buMd,dimx,dimy,dimz);
		RK_inner<<<dimGrid,dimBlock>>>(buW,bumW,buhW,butW,dt,RK4a[2],RK4b[1],dimx,dimy,dimz);
		LxF_LyB_LzB<<<dimGrid,dimBlock>>>(buW,buhW,buM,buMd,dimx,dimy,dimz);
		RK_inner<<<dimGrid,dimBlock>>>(buW,bumW,buhW,butW,dt,RK4a[3],RK4b[2],dimx,dimy,dimz);
		LxB_LyF_LzF<<<dimGrid,dimBlock>>>(buW,buhW,buM,buMd,dimx,dimy,dimz);
		RK_finish<<<dimGrid,dimBlock>>>(buW,buhW,butW,dt,RK4b[3],dimx,dimy,dimz);
		// step 7
		RK_init<<<dimGrid,dimBlock>>>(buW,bumW,butW,dimx,dimy,dimz);
		LxF_LyB_LzF<<<dimGrid,dimBlock>>>(buW,buhW,buM,buMd,dimx,dimy,dimz);
		RK_begin<<<dimGrid,dimBlock>>>(buW,bumW,buhW,butW,dt,RK4a[1],RK4b[0],dimx,dimy,dimz);
		LxB_LyF_LzB<<<dimGrid,dimBlock>>>(buW,buhW,buM,buMd,dimx,dimy,dimz);
		RK_inner<<<dimGrid,dimBlock>>>(buW,bumW,buhW,butW,dt,RK4a[2],RK4b[1],dimx,dimy,dimz);
		LxF_LyB_LzF<<<dimGrid,dimBlock>>>(buW,buhW,buM,buMd,dimx,dimy,dimz);
		RK_inner<<<dimGrid,dimBlock>>>(buW,bumW,buhW,butW,dt,RK4a[3],RK4b[2],dimx,dimy,dimz);
		LxB_LyF_LzB<<<dimGrid,dimBlock>>>(buW,buhW,buM,buMd,dimx,dimy,dimz);
		RK_finish<<<dimGrid,dimBlock>>>(buW,buhW,butW,dt,RK4b[3],dimx,dimy,dimz);
		// step 8
		RK_init<<<dimGrid,dimBlock>>>(buW,bumW,butW,dimx,dimy,dimz);
		LxB_LyF_LzF<<<dimGrid,dimBlock>>>(buW,buhW,buM,buMd,dimx,dimy,dimz);
		RK_begin<<<dimGrid,dimBlock>>>(buW,bumW,buhW,butW,dt,RK4a[1],RK4b[0],dimx,dimy,dimz);
		LxF_LyB_LzB<<<dimGrid,dimBlock>>>(buW,buhW,buM,buMd,dimx,dimy,dimz);
		RK_inner<<<dimGrid,dimBlock>>>(buW,bumW,buhW,butW,dt,RK4a[2],RK4b[1],dimx,dimy,dimz);
		LxB_LyF_LzF<<<dimGrid,dimBlock>>>(buW,buhW,buM,buMd,dimx,dimy,dimz);
		RK_inner<<<dimGrid,dimBlock>>>(buW,bumW,buhW,butW,dt,RK4a[3],RK4b[2],dimx,dimy,dimz);
		LxF_LyB_LzB<<<dimGrid,dimBlock>>>(buW,buhW,buM,buMd,dimx,dimy,dimz);
		RK_finish<<<dimGrid,dimBlock>>>(buW,buhW,butW,dt,RK4b[3],dimx,dimy,dimz);
    }

And here is one of the kernels:

__global__ void RK_begin(wave W, wave mW, wave hW, wave tW, float DT, float a, float b, float dimx, float dimy, float dimz)
{
    bool validr = true;
    const int gtidx = blockIdx.x * blockDim.x + threadIdx.x;
    const int gtidy = blockIdx.y * blockDim.y + threadIdx.y;
    // strides of the padded volume (RADIUS halo cells on each side)
    const int stride_y = dimx + 2 * RADIUS;
    const int stride_z = stride_y * (dimy + 2 * RADIUS);
    // offset of this thread's first interior point
    int indx = 0;
    indx += RADIUS * stride_y + RADIUS;
    indx += gtidy * stride_y + gtidx;
    indx += stride_z * 3;
    if ((gtidx >= dimx + RADIUS) || (gtidy >= dimy + RADIUS))
        validr = false;
    float rka, rkb;
    rka = a * DT;
    rkb = b * DT;
    // march along z, updating all nine wavefield components
    for (int iz = 0; iz < dimz; iz++) {
        if (validr) {
            W.Vx [indx] = mW.Vx [indx] + rka * hW.Vx [indx];
            W.Vy [indx] = mW.Vy [indx] + rka * hW.Vy [indx];
            W.Vz [indx] = mW.Vz [indx] + rka * hW.Vz [indx];
            W.Txx[indx] = mW.Txx[indx] + rka * hW.Txx[indx];
            W.Tyy[indx] = mW.Tyy[indx] + rka * hW.Tyy[indx];
            W.Tzz[indx] = mW.Tzz[indx] + rka * hW.Tzz[indx];
            W.Txy[indx] = mW.Txy[indx] + rka * hW.Txy[indx];
            W.Txz[indx] = mW.Txz[indx] + rka * hW.Txz[indx];
            W.Tyz[indx] = mW.Tyz[indx] + rka * hW.Tyz[indx];

            tW.Vx [indx] = mW.Vx [indx] + rkb * hW.Vx [indx];
            tW.Vy [indx] = mW.Vy [indx] + rkb * hW.Vy [indx];
            tW.Vz [indx] = mW.Vz [indx] + rkb * hW.Vz [indx];
            tW.Txx[indx] = mW.Txx[indx] + rkb * hW.Txx[indx];
            tW.Tyy[indx] = mW.Tyy[indx] + rkb * hW.Tyy[indx];
            tW.Tzz[indx] = mW.Tzz[indx] + rkb * hW.Tzz[indx];
            tW.Txy[indx] = mW.Txy[indx] + rkb * hW.Txy[indx];
            tW.Txz[indx] = mW.Txz[indx] + rkb * hW.Txz[indx];
            tW.Tyz[indx] = mW.Tyz[indx] + rkb * hW.Tyz[indx];
            indx += stride_z;
        }
    }
}

Most of the arrays are stored in global memory.

Just a wild guess: You might be filling the kernel launch queue, after which the cuda launch calls become blocking calls.
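A quick way to test that guess would be to time each launch on the host side: while the launch queue has room, a launch returns in a few microseconds; once it fills, the launch blocks until a slot frees up. A minimal stand-alone sketch (busy_kernel and the counts here are made up for illustration, not taken from your code):

#include <chrono>
#include <cstdio>

// Hypothetical stand-in kernel that burns time, so the queue fills
// faster than the GPU drains it.
__global__ void busy_kernel(float *buf, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        for (int k = 0; k < 10000; ++k)
            buf[i] = buf[i] * 0.999f + 1.0f;
}

int main()
{
    const int n = 1 << 20;
    float *buf;
    cudaMalloc(&buf, n * sizeof(float));

    for (int i = 0; i < 2000; ++i) {
        auto t0 = std::chrono::high_resolution_clock::now();
        busy_kernel<<<(n + 255) / 256, 256>>>(buf, n);
        auto t1 = std::chrono::high_resolution_clock::now();
        double us = std::chrono::duration<double, std::micro>(t1 - t0).count();
        // Expect a few microseconds per launch at first, then a jump
        // to roughly one kernel's runtime once the queue is full.
        if (i % 100 == 0)
            printf("launch %4d took %8.1f us\n", i, us);
    }
    cudaDeviceSynchronize();
    cudaFree(buf);
    return 0;
}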

Consider adding a cudaDeviceSynchronize() at the end of your loop - this will hopefully shift all synchronous API blocking to this call.
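Roughly like this (a sketch only, with your kernel launches elided):

for (int it = 0; it < timesteps; it = it + 8) {
    printf("\tt = %d ", it);

    // ... all of the RK_* / Lx*_Ly*_Lz* launches exactly as before ...

    // Drain the launch queue once per iteration, so the blocking
    // happens here rather than inside individual cudaLaunch calls.
    cudaDeviceSynchronize();
}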

Yeah, maybe you are right.

From the running output, the first 13 cycles are much faster than the rest of the run. I don’t know a lot about the kernel launch system; maybe I should read some material about it.

The cudaDeviceSynchronize() was already added.