The CPU version takes 0.006 s, whereas the GPU version takes 0.03 s.
cpu code
// Halo exchange staged through host buffers.
// d_v_p_dev is addressed as rows of J doubles: row 0 is filled from the `up`
// neighbour, row (I / ngpus + 1) from the `down` neighbour, while rows 1 and
// (I / ngpus) are the ones sent out. A neighbour equal to MPI_PROC_NULL means
// there is nothing to exchange on that side, so the device copies are skipped.

// Phase 1: send row (I / ngpus) down, receive row 0 from above.
if (down != MPI_PROC_NULL)
{
    cudaMemcpy(DownSideEdge, d_v_p_dev + (I / ngpus) * J, sizeof(double) * J, cudaMemcpyDeviceToHost);
}
// Every staged row holds J doubles, so the message length is J. The previous
// count of I only matched the staged data when I == J.
MPI_Sendrecv(DownSideEdge, J, MPI_DOUBLE, down, 0,
             UpsideHaloEdge, J, MPI_DOUBLE, up, 0, MPI_COMM_WORLD, &status);
if (up != MPI_PROC_NULL)
{
    // Store the row received from above into row 0 ...
    cudaMemcpy(d_v_p_dev, UpsideHaloEdge, sizeof(double) * J, cudaMemcpyHostToDevice);
    // ... and stage row 1 for the upward send of phase 2.
    cudaMemcpy(UpSideEdge, d_v_p_dev + J, sizeof(double) * J, cudaMemcpyDeviceToHost);
}

// Phase 2: send row 1 up, receive row (I / ngpus + 1) from below.
MPI_Sendrecv(UpSideEdge, J, MPI_DOUBLE, up, 0,
             DownsideHaloEdge, J, MPI_DOUBLE, down, 0, MPI_COMM_WORLD, &status);
if (down != MPI_PROC_NULL)
{
    cudaMemcpy(d_v_p_dev + (I / ngpus + 1) * J, DownsideHaloEdge, sizeof(double) * J, cudaMemcpyHostToDevice);
}
gpu code
// Halo exchange that hands device buffers straight to MPI_Sendrecv
// (this requires an MPI library built with CUDA support).
// The kernel bodies are not visible here; presumably SendDown/SendUp pack an
// edge row of d_v_p_dev into the *_dev send buffer and RecvUp/RecvDown unpack
// a received buffer into the matching halo row — confirm against the kernels.
// NOTE(review): the host-staged variant of this exchange moves J doubles per
// row; confirm that the count `I` used below matches what the kernels pack.

// Phase 1: send down, receive from above.
// The guards mirror the host-staged exchange: a side whose neighbour is
// MPI_PROC_NULL transfers nothing, so its buffer must not be packed/unpacked.
if (down != MPI_PROC_NULL)
{
    SendDown<<<nBlocks_change, THREADSPB>>>(DownSideEdge_dev, d_v_p_dev, I, J, ngpus);
}
// Kernel launches return immediately. MPI knows nothing about CUDA streams,
// so the send buffer must be completely written before MPI reads it.
cudaDeviceSynchronize();
MPI_Sendrecv(DownSideEdge_dev, I, MPI_DOUBLE, down, 0,
             UpsideHaloEdge_dev, I, MPI_DOUBLE, up, 0, MPI_COMM_WORLD, &status);

// Phase 2: consume what arrived from above, then send up / receive from below.
if (up != MPI_PROC_NULL)
{
    RecvUp<<<nBlocks_change, THREADSPB>>>(UpsideHaloEdge_dev, d_v_p_dev, I, J, ngpus);
    SendUp<<<nBlocks_change, THREADSPB>>>(UpSideEdge_dev, d_v_p_dev, I, J, ngpus);
}
// Same reason as above: UpSideEdge_dev must be ready before MPI touches it.
cudaDeviceSynchronize();
MPI_Sendrecv(UpSideEdge_dev, I, MPI_DOUBLE, up, 0,
             DownsideHaloEdge_dev, I, MPI_DOUBLE, down, 0, MPI_COMM_WORLD, &status);
if (down != MPI_PROC_NULL)
{
    RecvDown<<<nBlocks_change, THREADSPB>>>(DownsideHaloEdge_dev, d_v_p_dev, I, J, ngpus);
}