Here is my code.
#include <stdio.h>
#include <iostream>
#include <cuda.h>
#include <math.h>
#include <cuda_runtime.h>
using FLOAT = float;
using std::cout;
using std::endl;
__global__ void vec_add(FLOAT* x, FLOAT* y, FLOAT* z, int N)
{
int idx = blockDim.x * blockIdx.x + threadIdx.x;
for (int i = idx; i < N; i += gridDim.x * blockDim.x) {
z[i] = y[i] + x[i];
}
}
void vec_add_cpu(FLOAT* x, FLOAT* y, FLOAT* z, int N)
{
for (int i = 0; i < N; i++) {
z[i] = y[i] + x[i];
}
}
int main()
{
int N = 1000000;
const int nstreams = 1;
int nums_per_stream = N / nstreams;
N = nums_per_stream * nstreams;
int nbytes = N * sizeof(FLOAT);
int size_per_stream = nums_per_stream * sizeof(FLOAT);
int WARMUP = 2;
int NREPEATS = 128;
int bs = 256;
int s = ceil((nums_per_stream + bs - 1.) / bs);
dim3 grid(s);
FLOAT* dx, * hx;
FLOAT* dy, * hy;
FLOAT* dz, * hz;
cudaMalloc((void**)&dx, nbytes);
cudaMalloc((void**)&dy, nbytes);
cudaMalloc((void**)&dz, nbytes);
cudaHostAlloc(&hx, nbytes, cudaHostAllocDefault);
cudaHostAlloc(&hy, nbytes, cudaHostAllocDefault);
cudaHostAlloc(&hz, nbytes, cudaHostAllocDefault);
for (int i = 0; i < N; i++) {
hx[i] = 1.0;
hy[i] = 1.0;
}
cudaStream_t streams[nstreams];
for (int i = 0; i < nstreams; i++) {
cudaStreamCreate(&streams[i]);
}
for (size_t round = 0; round < WARMUP; round++)
{
for (int i = 0; i < nstreams; i++) {
int start_per_stream = i * nums_per_stream;
cudaMemcpyAsync(dx + start_per_stream, hx + start_per_stream, size_per_stream, cudaMemcpyHostToDevice, streams[i]);
cudaMemcpyAsync(dy + start_per_stream, hy + start_per_stream, size_per_stream, cudaMemcpyHostToDevice, streams[i]);
vec_add << <grid, bs, 0, streams[i] >> > (dx + start_per_stream, dy + start_per_stream, dz + start_per_stream, nums_per_stream);
cudaMemcpyAsync(hz + start_per_stream, dz + start_per_stream, size_per_stream, cudaMemcpyDeviceToHost, streams[i]);
}
}
float milliseconds = 0;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);
for (size_t round = 0; round < NREPEATS; round++)
{
for (int i = 0; i < nstreams; i++) {
int start_per_stream = i * nums_per_stream;
cudaMemcpyAsync(dx + start_per_stream, hx + start_per_stream, size_per_stream, cudaMemcpyHostToDevice, streams[i]);
cudaMemcpyAsync(dy + start_per_stream, hy + start_per_stream, size_per_stream, cudaMemcpyHostToDevice, streams[i]);
vec_add << <grid, bs, 0, streams[i] >> > (dx + start_per_stream, dy + start_per_stream, dz + start_per_stream, nums_per_stream);
cudaMemcpyAsync(hz + start_per_stream, dz + start_per_stream, size_per_stream, cudaMemcpyDeviceToHost, streams[i]);
}
}
cudaEventRecord(stop);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&milliseconds, start, stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);
float time = milliseconds / NREPEATS;
cout << nstreams << " streams time cost:\t" << time << " ms\t" << endl;
FLOAT* hz_cpu_res = (FLOAT*)malloc(nbytes);
vec_add_cpu(hx, hy, hz_cpu_res, N);
for (int i = 0; i < N; ++i) {
if (fabs(hz_cpu_res[i] - hz[i]) > 1e-6) {
printf("index: %d, cpu: %f, gpu: %f\n", i, hz_cpu_res[i], hz[i]);
break;
}
}
for (int i = 0; i < nstreams; i++) {
cudaStreamDestroy(streams[i]);
}
cudaFree(dx);
cudaFree(dy);
cudaFree(dz);
cudaFreeHost(hx);
cudaFreeHost(hy);
cudaFreeHost(hz);
free(hz_cpu_res);
return 0;
}
the profiling result is
0.58876 ms - 1 streams
0.660088 ms - 2 streams
0.744426 ms - 3 streams
0.843017 ms - 4 streams
the nsys analysis timeline looks like
there is no overlap between h2d, kernel and d2h. How can i program to make it happen?
OS: windows 11
GPU: RTX 3080(10GB)
CUDA: 13.1
Driver: 596.21
Here’s LLM’s explaination.
On Windows, GeForce gaming cards use WDDM (Windows Display Driver Model).
To ensure the responsiveness of the desktop and graphics applications, the driver batches all CUDA commands. Even if you submit them asynchronously in different streams, they are often serialized during hardware execution.
Is it correct? (I don’t have Linux to verify it now)

