Here are some pieces of code showing how I use it. In the code, the particle positions are stored as double3, and each system has Np particles. I define arrays of pointers and allocate them one by one on both the host and the device.
#define nstr 3 // number of streams I use
int main(void)
{
double3 *pos[nstr];
double3 *dev_pos[nstr];
double *dev_newuuu[nstr],*dev_olduuu;
double *dev_charge;
double jump;
double *h_ene[nstr];
static int h_acc[1];
double *dev_energy[nstr],*dev_totalene;
int *dev_acceptance;
double3 jxyz[nstr];
double rnd[nstr];
int atom_i[nstr];
double enepene[nstr];
cudaStream_t stream[nstr]; // streams
// memory allocations
for (int is = 0; is < nstr; is++)
{
cudaStreamCreate(&stream[is]);
cudaMalloc(&dev_pos[is],sizeof(double3)*Np);
cudaMalloc(&dev_energy[is],sizeof(double));
cudaMalloc(&dev_newuuu[is],sizeof(double)*gss);
cudaHostAlloc(&pos[is],sizeof(double3)*Np,cudaHostAllocDefault);
cudaHostAlloc(&h_ene[is],sizeof(double),cudaHostAllocDefault);
}
float gputime;
cudaEvent_t start,stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
init_config(pos,Np,lx,ly,lz,diamsq); // initilize the positions on the host
// copy the initial configurations to the device
for (int is = 0; is < nstr; is++)
{
cudaMemcpy(dev_pos[is],pos[is], sizeof(double3)*Np,cudaMemcpyHostToDevice);
}
// Example of how I run the streams in parallel
for(int ist=0;ist<nstr;ist++)
{
cudaMemcpy(dev_energy[ist], h_ene[ist], sizeof(double),cudaMemcpyHostToDevice);
}
cudaEventRecord(start,0);
for(int imes=0;imes<Neq;imes++)
{
h_acc[0]=0.0;
cudaMemcpy(dev_acceptance, h_acc, sizeof(int),cudaMemcpyHostToDevice);
for(int idl=0;idl<Nout;idl++)
{
for(int isp=0;isp<Np;isp++)
{
for(int ist=0;ist<nstr;ist++) // here calling the same function for different streams
{
jxyz[ist].x=jump*(2.0*genrand64_real2()-1.0);
jxyz[ist].y=jump*(2.0*genrand64_real2()-1.0);
jxyz[ist].z=jump*(2.0*genrand64_real2()-1.0);
atom_i[ist]=round((Np-1)*genrand64_real2());
rnd[ist]=genrand64_real2();
newMCenergyarray<<<gss,2*bsl,0,stream[ist]>>> (dev_pos[ist],dev_newuuu[ist], Np,jxyz[ist],atom_i[ist]); // first step
}
for(int ist=0;ist<nstr;ist++) // here calling the same function for different streams
{
vsu<<<1,1,0,stream[ist]>>>dev_pos[ist],dev_newuuu[ist],jxyz[ist],atom_i[ist],dev_acceptance,dev_energy[ist],rnd[ist]); // second step
}
}
}
}
cudaEventRecord(stop,0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&gputime,start,stop);
cudaEventDestroy(start);
cudaEventDestroy(stop) ;
printf(" \n");
printf("Time = %g \n", gputime/1000.0f);
printf(" \n");