How can i program to make memcpy and kernel overlaped?

Here is my code.

#include <stdio.h>
#include <iostream>
#include <cuda.h>
#include <math.h>
#include <cuda_runtime.h>

using FLOAT = float;
using std::cout;
using std::endl;

__global__ void vec_add(FLOAT* x, FLOAT* y, FLOAT* z, int N)
{
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    for (int i = idx; i < N; i += gridDim.x * blockDim.x) {
        z[i] = y[i] + x[i];
    }
}

void vec_add_cpu(FLOAT* x, FLOAT* y, FLOAT* z, int N)
{
    for (int i = 0; i < N; i++) {
        z[i] = y[i] + x[i];
    }
}

int main()
{
    int N = 1000000;
    const int nstreams = 1;
    int nums_per_stream = N / nstreams;
    N = nums_per_stream * nstreams;
    int nbytes = N * sizeof(FLOAT);
    int size_per_stream = nums_per_stream * sizeof(FLOAT);
    int WARMUP = 2;
    int NREPEATS = 128;

    int bs = 256;

    int s = ceil((nums_per_stream + bs - 1.) / bs);
    dim3 grid(s);

    FLOAT* dx, * hx;
    FLOAT* dy, * hy;
    FLOAT* dz, * hz;

    cudaMalloc((void**)&dx, nbytes);
    cudaMalloc((void**)&dy, nbytes);
    cudaMalloc((void**)&dz, nbytes);

    cudaHostAlloc(&hx, nbytes, cudaHostAllocDefault);
    cudaHostAlloc(&hy, nbytes, cudaHostAllocDefault);
    cudaHostAlloc(&hz, nbytes, cudaHostAllocDefault);

    for (int i = 0; i < N; i++) {
        hx[i] = 1.0;
        hy[i] = 1.0;
    }
    cudaStream_t streams[nstreams];

    for (int i = 0; i < nstreams; i++) {
        cudaStreamCreate(&streams[i]);
    }

    for (size_t round = 0; round < WARMUP; round++)
    {
        for (int i = 0; i < nstreams; i++) {
            int start_per_stream = i * nums_per_stream;
            cudaMemcpyAsync(dx + start_per_stream, hx + start_per_stream, size_per_stream, cudaMemcpyHostToDevice, streams[i]);
            cudaMemcpyAsync(dy + start_per_stream, hy + start_per_stream, size_per_stream, cudaMemcpyHostToDevice, streams[i]);
            vec_add << <grid, bs, 0, streams[i] >> > (dx + start_per_stream, dy + start_per_stream, dz + start_per_stream, nums_per_stream);
            cudaMemcpyAsync(hz + start_per_stream, dz + start_per_stream, size_per_stream, cudaMemcpyDeviceToHost, streams[i]);
        }
    }

    float milliseconds = 0;
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);

    for (size_t round = 0; round < NREPEATS; round++)
    {
        for (int i = 0; i < nstreams; i++) {
            int start_per_stream = i * nums_per_stream;
            cudaMemcpyAsync(dx + start_per_stream, hx + start_per_stream, size_per_stream, cudaMemcpyHostToDevice, streams[i]);
            cudaMemcpyAsync(dy + start_per_stream, hy + start_per_stream, size_per_stream, cudaMemcpyHostToDevice, streams[i]);
            vec_add << <grid, bs, 0, streams[i] >> > (dx + start_per_stream, dy + start_per_stream, dz + start_per_stream, nums_per_stream);
            cudaMemcpyAsync(hz + start_per_stream, dz + start_per_stream, size_per_stream, cudaMemcpyDeviceToHost, streams[i]);
        }
    }

    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&milliseconds, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    float time = milliseconds / NREPEATS;
    cout << nstreams << " streams time cost:\t" << time << " ms\t" << endl;

    FLOAT* hz_cpu_res = (FLOAT*)malloc(nbytes);
    vec_add_cpu(hx, hy, hz_cpu_res, N);
    for (int i = 0; i < N; ++i) {
        if (fabs(hz_cpu_res[i] - hz[i]) > 1e-6) {
            printf("index: %d, cpu: %f, gpu: %f\n", i, hz_cpu_res[i], hz[i]);
            break;
        }
    }
    for (int i = 0; i < nstreams; i++) {
        cudaStreamDestroy(streams[i]);
    }
    cudaFree(dx);
    cudaFree(dy);
    cudaFree(dz);
    cudaFreeHost(hx);
    cudaFreeHost(hy);
    cudaFreeHost(hz);
    free(hz_cpu_res);

    return 0;
}

the profiling result is
0.58876 ms - 1 streams
0.660088 ms - 2 streams
0.744426 ms - 3 streams
0.843017 ms - 4 streams

the nsys analysis timeline looks like

there is no overlap between h2d, kernel and d2h. How can i program to make it happen?

OS: windows 11
GPU: RTX 3080(10GB)
CUDA: 13.1
Driver: 596.21

Here’s LLM’s explaination.

On Windows, GeForce gaming cards use WDDM (Windows Display Driver Model).
To ensure the responsiveness of the desktop and graphics applications, the driver batches all CUDA commands. Even if you submit them asynchronously in different streams, they are often serialized during hardware execution.

Is it correct? (I don’t have Linux to verify it now)

your code shows:

using only a single stream will serialize everything. (Yes, I acknowledge that you are probably changing this. Just pointing it out for completeness/future readers.)

There are (at least) 2 rules for streams:

  1. Items issued into the same stream will execute in issue order.
  2. Items issued into separate (non-NULL) streams have no ordering relationship defined by CUDA.
    You’ll need to issue items into separate (non-NULL) streams if you want to see overlap.

I would also point out that your kernel duration appears to be quite short, less than 20 microseconds. That’s going to be difficult to witness overlap with other operations.

And its possible that windows batching may also be a factor. You could try both settings of Windows Hardware Accelerated GPU Scheduling to see if it has any effect on your observation(s).

Here is a section from an nsys profile result of your code running on a linux machine (L4 GPU, CUDA 13.0). The only change I made to your code was to set nstreams to 4:

You can see multiple forms of concurrency there, including copy/compute overlap, and also D2H/H2D overlap, between the various streams of activity.

So I would guess that the issue on your side has to do with the Windows OS.

Thanks Robert.