Kernel takes more and more time at each iteration

Hello,

For my final-year project, I have to implement a DSMC simulation on the GPU using CUDA.

This is the kernel that causes the problem.

// DSMC (NTC-style) collision kernel: one thread per cell (cells indexed by tid).
//
// For each cell containing at least two molecules, repeatedly:
//   1. pick a random pair of molecules in the cell (acceptance-rejection on
//      their relative speed against c_r_max),
//   2. scatter them through random angles (epsilon, chi), conserving the
//      pair's center-of-mass velocity,
//   3. advance the cell's collision clock delai[tid] by the sampled
//      inter-collision time,
// until the clock exceeds the per-call budget `arret`; the excess is carried
// over to the next kernel call.
//
// Parameters (as reconstructed — the forum paste stripped the `*` from the
// pointer parameters; the body indexes mol, nbrecumule and coco, so they must
// be pointers):
//   mol          - molecule array; vit[3] is the velocity, indexed via nummol
//   nbrecumule   - cumulative molecule counts per cell; cell tid owns
//                  indices [nbrecumule[tid], nbrecumule[tid+1])
//   delai        - per-cell collision clock, carried across calls
//   taille, sample - per-cell statistics used for the mean population estimate
//   c_r_max      - upper bound on relative speed for acceptance-rejection
//   dt, appcell, m_mtstate, m_config, m_istate - unused here (kept for ABI)
//   n_per_cell_ref - reference molecule count per cell
//   coco         - per-cell scratch/output, zeroed on entry
//
// NOTE(review): the forum formatting also removed the `*` from
// `blockDim.x * blockIdx.x` and the underscores from `__global__`; both are
// restored below. The `while (delai[tid] <= arret)` loop was commented out in
// the paste but its closing brace was left in place (the braces did not
// balance); it is restored here, which matches the trailing
// `delai[tid] -= arret` carry-over.
__global__ void CollGPU(mlcl *mol, uint *nbrecumule, float *delai,
                        float *taille, float *sample, float c_r_max, int dt,
                        int n_per_cell_ref, uint *appcell, uint *nummol,
                        uint *m_mtstate, mt_struct_stripped *m_config,
                        int *m_istate, float *coco)
{
    const int tid = blockDim.x * blockIdx.x + threadIdx.x;
    cudahelp::rand::MTGenerator gen(tid);

    // Per-call collision-time budget. 1.0f/100.0f keeps everything in
    // single precision (the pasted float(1.0)/100 promoted through double).
    const float arret = 1.0f / 100.0f;
    coco[tid] = 0;

    // NOTE(review): `tid < 200` hard-codes the cell count — presumably the
    // grid is launched with exactly 200 active cells; confirm against the
    // launch configuration and consider passing the cell count as a parameter.
    if (nbrecumule[tid + 1] - nbrecumule[tid] > 1 && tid < 200)
    {
        const uint first = nbrecumule[tid];
        const uint count = nbrecumule[tid + 1] - nbrecumule[tid];

        // Run collisions until this cell's clock exceeds the budget.
        while (delai[tid] <= arret)
        {
            int index1, index2;
            v3 c_m, c_r, c_r_star;
            float norm_c_r;

            // Acceptance-rejection: draw pairs until the relative speed is
            // accepted against c_r_max. NOTE(review): if GetFloat() can
            // return exactly 1.0f, index1 can land one past the cell's last
            // molecule and the index2 bump can do the same — verify
            // GetFloat()'s range is [0,1).
            do {
                index1 = first + count * gen.GetFloat();
                index2 = first + (count - 1) * gen.GetFloat();
                // Skip over index1 so the two molecules are distinct.
                index2 = (index2 >= index1) ? (index2 + 1) : index2;

                for (int i = 0; i < 3; i++) {
                    // Center-of-mass and relative velocities of the pair.
                    c_m[i] = (mol[nummol[index1]].vit[i] + mol[nummol[index2]].vit[i]) / 2;
                    c_r[i] = mol[nummol[index1]].vit[i] - mol[nummol[index2]].vit[i];
                }

                norm_c_r = sqrtf(c_r[0] * c_r[0] + c_r[1] * c_r[1] + c_r[2] * c_r[2]);
            } while (norm_c_r < c_r_max * gen.GetFloat());

            // Random scattering angles: epsilon uniform on [0, 2*pi),
            // cos(chi) uniform on [-1, 1]. Single-precision math functions
            // (sinf/cosf/acosf) avoid the silent double-precision path of
            // sin/cos/acos in a float kernel.
            const float epsilon = 2.0f * gen.GetFloat() * float(M_PI);
            const float chi  = acosf(1.0f - 2.0f * gen.GetFloat());
            const float seps = sinf(epsilon);
            const float ceps = cosf(epsilon);
            const float schi = sinf(chi);
            const float cchi = cosf(chi);

            // Rotate the relative velocity by (chi, epsilon).
            // NOTE(review): B == 0 when c_r[1] == c_r[2] == 0 (relative
            // velocity along x) — the divisions below then produce inf/NaN;
            // consider handling that degenerate case explicitly.
            const float B = sqrtf(c_r[1] * c_r[1] + c_r[2] * c_r[2]);
            c_r_star[0] = cchi * c_r[0] + schi * seps * B;
            c_r_star[1] = cchi * c_r[1] + schi * (norm_c_r * c_r[2] * ceps - c_r[0] * c_r[1] * seps) / B;
            c_r_star[2] = cchi * c_r[2] - schi * (norm_c_r * c_r[1] * ceps + c_r[0] * c_r[2] * seps) / B;

            // Post-collision velocities: center of mass +/- half the rotated
            // relative velocity (momentum-conserving).
            for (int j = 0; j < 3; ++j) {
                mol[nummol[index1]].vit[j] = ( c_r_star[j] + 2 * c_m[j]) / 2;
                mol[nummol[index2]].vit[j] = (-c_r_star[j] + 2 * c_m[j]) / 2;
            }

            // Mean population of the cell; fall back to the instantaneous
            // count when no samples were accumulated yet (taille[tid] == 0).
            const float n_aver_for_collision_interval =
                (taille[tid] == 0) ? float(count)
                                   : (float(taille[tid]) / sample[tid]);

            // Time to the next collision in this cell.
            const float delta_t_coll =
                (2.0f * float(M_SQRT2) / float(count)) * float(n_per_cell_ref)
                / (n_aver_for_collision_interval * norm_c_r);
            delai[tid] += delta_t_coll;
        }

        // Carry the unused excess over to the next kernel call.
        delai[tid] -= arret;
    }
}

I suspect that the generation of the pseudo-random numbers is the problem.

For example, the first 1000 iterations take 15 ms,
but 2000 iterations take 250 ms…

I don’t understand why it takes so long later on but not at the beginning.

Thank you for your help, and sorry for my poor English.