Why is the transfer rate so much slower with GPU (device) buffers than with CPU (host-staged) buffers when executing MPI send and recv?

The CPU (host-staged) version spends 0.006 s, while the GPU (device-buffer) version spends 0.03 s.

cpu code

    /* Host-staged halo exchange: copy the edge rows of the device grid to
     * host buffers, exchange them with the up/down neighbour ranks, and
     * copy the received halos back to the device.
     *
     * NOTE(review): the cudaMemcpy calls move J doubles per row, but the
     * MPI_Sendrecv calls originally used a count of I — the two must agree.
     * Each edge row holds J values, so J is used consistently below.
     * Presumably the local slab has I/ngpus interior rows of J columns plus
     * one halo row on each side — TODO confirm against the allocation. */
    if (down != MPI_PROC_NULL)
    {
        /* Stage the bottom interior row (row I/ngpus) on the host. */
        cudaMemcpy(DownSideEdge, d_v_p_dev + (I / ngpus) * J, sizeof(double) * J, cudaMemcpyDeviceToHost);
    }

    /* Send our bottom row down; receive the top halo from the rank above.
     * MPI_PROC_NULL neighbours make the corresponding half a no-op. */
    MPI_Sendrecv(DownSideEdge, J, MPI_DOUBLE, down, 0,
                 UpsideHaloEdge, J, MPI_DOUBLE, up, 0, MPI_COMM_WORLD, &status);

    if (up != MPI_PROC_NULL)
    {
        /* Store the received top halo in row 0 on the device. */
        cudaMemcpy(d_v_p_dev, UpsideHaloEdge, sizeof(double) * J, cudaMemcpyHostToDevice);
    }

    if (up != MPI_PROC_NULL)
    {
        /* Stage the top interior row (row 1) on the host. */
        cudaMemcpy(UpSideEdge, d_v_p_dev + J, sizeof(double) * J, cudaMemcpyDeviceToHost);
    }

    /* Send our top row up; receive the bottom halo from the rank below. */
    MPI_Sendrecv(UpSideEdge, J, MPI_DOUBLE, up, 0,
                 DownsideHaloEdge, J, MPI_DOUBLE, down, 0, MPI_COMM_WORLD, &status);

    if (down != MPI_PROC_NULL)
    {
        /* Store the received bottom halo in row I/ngpus + 1 on the device. */
        cudaMemcpy(d_v_p_dev + (I / ngpus + 1) * J, DownsideHaloEdge, sizeof(double) * J, cudaMemcpyHostToDevice);
    }

gpu code
/* Device-buffer (CUDA-aware MPI) halo exchange: pack/unpack kernels move the
 * edge rows between the grid and contiguous device buffers, and MPI_Sendrecv
 * is given the device pointers directly.
 *
 * NOTE(review): kernel launches are asynchronous. Without synchronization,
 * MPI_Sendrecv may be handed a buffer the pack kernel has not finished
 * writing, and — relevant to the timing question — the blocking MPI call
 * silently absorbs the kernel's execution time, so the "transfer" appears
 * slow. Synchronize after each pack kernel before calling MPI.
 *
 * NOTE(review): counts changed from I to J to match the CPU path, which
 * stages J doubles per edge row — TODO confirm the kernels pack J values. */
SendDown<<<nBlocks_change, THREADSPB>>>(DownSideEdge_dev, d_v_p_dev, I, J, ngpus);
cudaDeviceSynchronize();  /* ensure DownSideEdge_dev is fully written */

/* Send our bottom row down; receive the top halo from the rank above. */
MPI_Sendrecv(DownSideEdge_dev, J, MPI_DOUBLE, down, 0,
             UpsideHaloEdge_dev, J, MPI_DOUBLE, up, 0, MPI_COMM_WORLD, &status);

/* Unpack the received top halo into the grid (device side). */
RecvUp<<<nBlocks_change, THREADSPB>>>(UpsideHaloEdge_dev, d_v_p_dev, I, J, ngpus);

SendUp<<<nBlocks_change, THREADSPB>>>(UpSideEdge_dev, d_v_p_dev, I, J, ngpus);
cudaDeviceSynchronize();  /* ensure UpSideEdge_dev is fully written */

/* Send our top row up; receive the bottom halo from the rank below. */
MPI_Sendrecv(UpSideEdge_dev, J, MPI_DOUBLE, up, 0,
             DownsideHaloEdge_dev, J, MPI_DOUBLE, down, 0, MPI_COMM_WORLD, &status);

/* Unpack the received bottom halo into the grid (device side). */
RecvDown<<<nBlocks_change, THREADSPB>>>(DownsideHaloEdge_dev, d_v_p_dev, I, J, ngpus);

Maybe your kernels take a long time to run: kernel launches are asynchronous, so the subsequent blocking MPI_Sendrecv waits for the pack kernel to finish before the data is valid — that kernel time then shows up inside your "transfer" measurement. Add a cudaDeviceSynchronize() before timing the MPI call to see where the time actually goes.