cudaMemcpy costs too much time

I’m developing an optimization algorithm with CUDA; the core code goes like

    // drrLength = 65536
    //
    // Per-iteration host->device uploads. NOTE(review): cudaMemcpy is
    // blocking, and with pageable host memory each call also pays an internal
    // staging copy. Consider pinned host buffers (cudaMallocHost) plus
    // cudaMemcpyAsync on a dedicated stream, and packing the two tiny 3-float
    // camera vectors into the same staging buffer as the point arrays so four
    // copies (two of them only 12 bytes) become one.
    cudaMemcpy(d_pointIJK1, pointIJK1, drrLength * 3 * sizeof(float), cudaMemcpyKind::cudaMemcpyHostToDevice);
    cudaMemcpy(d_pointIJK2, pointIJK2, drrLength * 3 * sizeof(float), cudaMemcpyKind::cudaMemcpyHostToDevice);
    cudaMemcpy(d_cameraIJK1, camera1, 3 * sizeof(float), cudaMemcpyKind::cudaMemcpyHostToDevice);
    cudaMemcpy(d_cameraIJK2, camera2, 3 * sizeof(float), cudaMemcpyKind::cudaMemcpyHostToDevice);

    // One thread per DRR sample; ceil-divide so a partial last block still
    // covers the tail when drrLength is not a multiple of the block size.
    int threadsPerBlock = 256;
    int blocksPerGrid = (drrLength + threadsPerBlock - 1) / threadsPerBlock;

    // Kernel launches are asynchronous: they return to the host immediately
    // and only enqueue work. (In debug builds, follow each launch with a
    // cudaGetLastError() check to catch bad launch configurations.)
    // Fix: pass 1.0f, not 1.0 — a bare 1.0 is a double literal; the float
    // suffix states the intent and avoids an implicit double->float narrowing
    // if the kernel parameter is float (the value itself is identical).
    RayCastInterpolate<<<blocksPerGrid, threadsPerBlock>>>(d_ct, ctSize[0], ctSize[1], ctSize[2], d_pointIJK1,
                                                           d_cameraIJK1, d_bounds, d_origins, d_normals, d_drrPtr1,
                                                           drrLength, 1.0f, threshold, k, A, B);
    RayCastInterpolate<<<blocksPerGrid, threadsPerBlock>>>(d_ct, ctSize[0], ctSize[1], ctSize[2], d_pointIJK2,
                                                           d_cameraIJK2, d_bounds, d_origins, d_normals, d_drrPtr2,
                                                           drrLength, 1.0f, threshold, k, A, B);
    // NOTE(review): <<<1, 1>>> launches a single GPU thread — effectively
    // serial execution. If short2ucharKernel loops over drrLength internally,
    // rewrite it as a grid-stride loop (or add an `if (i < n)` guard) and
    // launch it with <<<blocksPerGrid, threadsPerBlock>>>; confirm the
    // kernel's indexing scheme before changing the launch configuration.
    short2ucharKernel<<<1, 1>>>(d_drrPtr1, d_drr1, drrLength);
    short2ucharKernel<<<1, 1>>>(d_drrPtr2, d_drr2, drrLength);

    // 2D launch over a 256x256 image; ceil-divide in both dimensions.
    dim3 blockSize(32, 32);
    dim3 gridSize((256 + blockSize.x - 1) / blockSize.x, (256 + blockSize.y - 1) / blockSize.y);
    metricKernel<<<gridSize, blockSize>>>(d_drr1, d_drr2, d_fixedSobelX1, d_fixedSobelY1, d_fixedMag1, d_fixedSobelX2,
                                          d_fixedSobelY2, d_fixedMag2, d_fixedMask1, d_fixedMask2, d_metric1,
                                          d_metric2);

    // NOTE(review): the array size is hard-coded; it must stay >= drrLength or
    // the device-to-host copies below overflow these host buffers. Tie the two
    // together (e.g. a shared constexpr) if drrLength can ever change.
    std::array<float, 65536> metric1, metric2;
    // These blocking copies are the FIRST synchronization point after all the
    // asynchronous launches above, so a timeline profiler (Nsight Systems)
    // charges the entire pending kernel execution time to the first cudaMemcpy.
    // That is the observed "10000x" cost — it is wait time, not transfer time.
    cudaMemcpy(metric1.data(), d_metric1, drrLength * sizeof(float), cudaMemcpyKind::cudaMemcpyDeviceToHost);
    cudaMemcpy(metric2.data(), d_metric2, drrLength * sizeof(float), cudaMemcpyKind::cudaMemcpyDeviceToHost);

The code piece above will run every iteration during the optimization process, and the optimization algorithm updates the input each iteration so I need to copy the input data from Host to Device and get the result from Device to Host.

However, when I use Nsight Systems to profile my program, it shows that, when copying data from Device to Host, the first cudaMemcpy call takes roughly 10000x longer than the kernel functions.

Why does this happen, and is there a way to fix it?

Kernel launches are asynchronous: they return to the host immediately while the GPU works in the background. The first blocking cudaMemcpy is the first call that must wait for the device, so the profiler attributes all of the kernels' execution time to it — that measured time includes waiting for every queued kernel to finish, not just the transfer itself.