Thanks for the marco tera, after messing around with it, I’ve still ran a roadblock of data staying the same in the system. Here’s the new main code (the integrate function is still the same:
[codebox]for( int i = 0; i < ntime; i++){
cudaMemcpy(cpos, pos, sizeof(float)*3*man, cudaMemcpyHostToDevice);
cudaMemcpy(cxm, xm, sizeof(float)*3*man, cudaMemcpyHostToDevice);
cudaMemcpy(cforce, force, sizeof(float)*3*man, cudaMemcpyHostToDevice);
calc_force(forcer, pos,force,man,box, potene);
integrate <<<3*man, 1234567 >>> (cpos,force,dt,man, cxm, kinene, i, ntime);
cudaMemcpy(pos, cpos, sizeof(float)*3*man, cudaMemcpyDeviceToHost);
cudaMemcpy(xm, cxm, sizeof(float)*3*man, cudaMemcpyDeviceToHost);
cudaMemcpy(force, cforce, sizeof(float)*3*man, cudaMemcpyDeviceToHost);
for( int j=0; j< 3*man; i++){
cout <<pos[j];
}
}
global void integrate( double *pos, double *force, double dt, int man, double *xm, double ken, int j, int ntime)
{
double xx, vi;
ken = 0.0;
for( int i=0; i< 3*man; i++){
//timestep[i] = i*dt;
xx= 2*pos[i] - xm[i] + (dt*dt*force[i]);
vi = (xx-xm[i])/(2*dt);
ken += vi*vi;
xm[i] = pos[i];
pos[i] = xx;
// #cout << k << endl;
}
}[/codebox]
I think I may have to do a device to device copy, but I feel that would be inefficient and would not help one bit. Can anyone offer thoughts on this?