Rtx3070 Why when I run the program with CUDA Graph and multiple streams, there's no parallelism observed when monitoring with nsys

code

#include <cstdio>
#include <cuda_runtime.h>

#define CHECK_CUDA(call) { \
    cudaError_t err = call; \
    if (err != cudaSuccess) { \
        printf("CUDA Error at %s:%d - %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
        exit(EXIT_FAILURE); \
    } \
}

__global__ void fma_kernel(float* a, float* b, float* c, float* result, int n, int iterations) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= n) return;

    float val_a = a[idx];
    float val_b = b[idx];
    float val_c = c[idx];
    float res = 0.0f;

    for (int i = 0; i < iterations; ++i) {
        res = val_a * val_b + val_c;
        val_a = res;
        val_b = res;
        val_c = res;
    }
    result[idx] = res;
}

int main() {
    const int n_streams = 4;
    const int data_size = 1 << 20;
    const int block_size = 256;
    const int grid_size = (data_size + block_size - 1) / block_size;
    const int iterations = 1000;

    printf("使用 CUDA Graph 配置: %d 个流, 每个流 %d 个元素, %d 次迭代\n", n_streams, data_size, iterations);

    // 1. 创建流和分配内存(每个流仍然需要独立的内存)
    cudaStream_t streams[n_streams];
    cudaGraph_t graphs[n_streams];
    cudaGraphExec_t graphInstances[n_streams];

    float *h_a[n_streams], *h_b[n_streams], *h_c[n_streams], *h_result[n_streams];
    float *d_a[n_streams], *d_b[n_streams], *d_c[n_streams], *d_result[n_streams];

    for (int i = 0; i < n_streams; ++i) {
        CHECK_CUDA(cudaStreamCreate(&streams[i]));
        CHECK_CUDA(cudaMallocHost(&h_a[i], data_size * sizeof(float)));
        CHECK_CUDA(cudaMallocHost(&h_b[i], data_size * sizeof(float)));
        CHECK_CUDA(cudaMallocHost(&h_c[i], data_size * sizeof(float)));
        CHECK_CUDA(cudaMallocHost(&h_result[i], data_size * sizeof(float)));

        for (int j = 0; j < data_size; ++j) {
            h_a[i][j] = static_cast<float>(j + 1 + i);
            h_b[i][j] = static_cast<float>(j + 2 + i);
            h_c[i][j] = static_cast<float>(j + 3 + i);
        }

        CHECK_CUDA(cudaMalloc(&d_a[i], data_size * sizeof(float)));
        CHECK_CUDA(cudaMalloc(&d_b[i], data_size * sizeof(float)));
        CHECK_CUDA(cudaMalloc(&d_c[i], data_size * sizeof(float)));
        CHECK_CUDA(cudaMalloc(&d_result[i], data_size * sizeof(float)));
    }

    // 2. 为每个流创建并实例化一个 Graph
    for (int i = 0; i < n_streams; ++i) {
        cudaGraph_t graph;
        CHECK_CUDA(cudaStreamBeginCapture(streams[i], cudaStreamCaptureModeGlobal));

        // 在流中执行操作,这些操作将被捕获到Graph中
        CHECK_CUDA(cudaMemcpyAsync(d_a[i], h_a[i], data_size * sizeof(float), cudaMemcpyHostToDevice, streams[i]));
        CHECK_CUDA(cudaMemcpyAsync(d_b[i], h_b[i], data_size * sizeof(float), cudaMemcpyHostToDevice, streams[i]));
        CHECK_CUDA(cudaMemcpyAsync(d_c[i], h_c[i], data_size * sizeof(float), cudaMemcpyHostToDevice, streams[i]));

        fma_kernel<<<grid_size, block_size, 0, streams[i]>>>(d_a[i], d_b[i], d_c[i], d_result[i], data_size, iterations);

        CHECK_CUDA(cudaMemcpyAsync(h_result[i], d_result[i], data_size * sizeof(float), cudaMemcpyDeviceToHost, streams[i]));

        CHECK_CUDA(cudaStreamEndCapture(streams[i], &graph)); // 结束捕获,得到Graph
        graphs[i] = graph;

        // 实例化Graph,创建一个可执行的Graph实例
        cudaGraphExec_t instance;
        CHECK_CUDA(cudaGraphInstantiate(&instance, graphs[i], NULL, NULL, 0));
        graphInstances[i] = instance;
    }

    // 3. 计时并执行Graph
    cudaEvent_t start, stop;
    CHECK_CUDA(cudaEventCreate(&start));
    CHECK_CUDA(cudaEventCreate(&stop));

    CHECK_CUDA(cudaEventRecord(start, 0));

    // 一次性启动所有Graph!这是并发的关键。
    for (int i = 0; i < n_streams; ++i) {
        CHECK_CUDA(cudaGraphLaunch(graphInstances[i], streams[i]));
    }

    // 等待所有流执行完毕
    for (int i = 0; i < n_streams; ++i) {
        CHECK_CUDA(cudaStreamSynchronize(streams[i]));
    }

    CHECK_CUDA(cudaEventRecord(stop, 0));
    CHECK_CUDA(cudaEventSynchronize(stop));

    float elapsed_time;
    CHECK_CUDA(cudaEventElapsedTime(&elapsed_time, start, stop));
    printf("CUDA Graph 总执行时间: %.3f ms\n", elapsed_time);
    printf("吞吐量: %.3f GFLOP/s\n", (static_cast<double>(n_streams) * data_size * iterations * 2 * 1e-3) / elapsed_time);

    // 4. 清理资源
    for (int i = 0; i < n_streams; ++i) {
        CHECK_CUDA(cudaGraphExecDestroy(graphInstances[i]));
        CHECK_CUDA(cudaGraphDestroy(graphs[i]));
        CHECK_CUDA(cudaStreamDestroy(streams[i]));
        CHECK_CUDA(cudaFreeHost(h_a[i]));
        CHECK_CUDA(cudaFreeHost(h_b[i]));
        CHECK_CUDA(cudaFreeHost(h_c[i]));
        CHECK_CUDA(cudaFreeHost(h_result[i]));
        CHECK_CUDA(cudaFree(d_a[i]));
        CHECK_CUDA(cudaFree(d_b[i]));
        CHECK_CUDA(cudaFree(d_c[i]));
        CHECK_CUDA(cudaFree(d_result[i]));
    }
    CHECK_CUDA(cudaEventDestroy(start));
    CHECK_CUDA(cudaEventDestroy(stop));

    return 0;
}

nsys

This is what I observe running on CUDA 13.0 on linux on a L4 GPU:


Its about what I would expect. I can’t explain your results. If you try running on linux it may help. You might also try updating to the latest CUDA version.

Thanks, Switching from CUDA 12.0 to 12.9 changed the graph execution from serial to parallel, but canceling the graph and using only the stream approach still results in serial execution.

coda

#include <cstdio>
#include <cuda_runtime.h>

#define CHECK_CUDA(call) { \
    cudaError_t err = call; \
    if (err != cudaSuccess) { \
        printf("CUDA Error at %s:%d - %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
        exit(EXIT_FAILURE); \
    } \
}

__global__ void fma_kernel(float* a, float* b, float* c, float* result, int n, int iterations) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= n) return;

    float val_a = a[idx];
    float val_b = b[idx];
    float val_c = c[idx];
    float res = 0.0f;

    for (int i = 0; i < iterations; ++i) {
        res = val_a * val_b + val_c;
        val_a = res;
        val_b = res;
        val_c = res;
    }
    result[idx] = res;
}

int main() {
    const int n_streams = 4;
    const int data_size = 1 << 20;
    const int block_size = 256;
    const int grid_size = (data_size + block_size - 1) / block_size;
    const int iterations = 100000;

    printf("使用 CUDA Graph 配置: %d 个流, 每个流 %d 个元素, %d 次迭代\n", n_streams, data_size, iterations);

    // 1. 创建流和分配内存(每个流仍然需要独立的内存)
    cudaStream_t streams[n_streams];

    float *h_a[n_streams], *h_b[n_streams], *h_c[n_streams], *h_result[n_streams];
    float *d_a[n_streams], *d_b[n_streams], *d_c[n_streams], *d_result[n_streams];

    for (int i = 0; i < n_streams; ++i) {
        CHECK_CUDA(cudaStreamCreate(&streams[i]));
        CHECK_CUDA(cudaMallocHost(&h_a[i], data_size * sizeof(float)));
        CHECK_CUDA(cudaMallocHost(&h_b[i], data_size * sizeof(float)));
        CHECK_CUDA(cudaMallocHost(&h_c[i], data_size * sizeof(float)));
        CHECK_CUDA(cudaMallocHost(&h_result[i], data_size * sizeof(float)));

        for (int j = 0; j < data_size; ++j) {
            h_a[i][j] = static_cast<float>(j + 1 + i);
            h_b[i][j] = static_cast<float>(j + 2 + i);
            h_c[i][j] = static_cast<float>(j + 3 + i);
        }

        CHECK_CUDA(cudaMalloc(&d_a[i], data_size * sizeof(float)));
        CHECK_CUDA(cudaMalloc(&d_b[i], data_size * sizeof(float)));
        CHECK_CUDA(cudaMalloc(&d_c[i], data_size * sizeof(float)));
        CHECK_CUDA(cudaMalloc(&d_result[i], data_size * sizeof(float)));
    }

    // 2. 为每个流创建并实例化一个 Graph
    for (int i = 0; i < n_streams; ++i) {

        // 在流中执行操作,这些操作将被捕获到Graph中
        CHECK_CUDA(cudaMemcpyAsync(d_a[i], h_a[i], data_size * sizeof(float), cudaMemcpyHostToDevice, streams[i]));
        CHECK_CUDA(cudaMemcpyAsync(d_b[i], h_b[i], data_size * sizeof(float), cudaMemcpyHostToDevice, streams[i]));
        CHECK_CUDA(cudaMemcpyAsync(d_c[i], h_c[i], data_size * sizeof(float), cudaMemcpyHostToDevice, streams[i]));

        fma_kernel<<<grid_size, block_size, 0, streams[i]>>>(d_a[i], d_b[i], d_c[i], d_result[i], data_size, iterations);

        CHECK_CUDA(cudaMemcpyAsync(h_result[i], d_result[i], data_size * sizeof(float), cudaMemcpyDeviceToHost, streams[i]));

    }

    // 3. 计时并执行Graph
    cudaEvent_t start, stop;
    CHECK_CUDA(cudaEventCreate(&start));
    CHECK_CUDA(cudaEventCreate(&stop));

    CHECK_CUDA(cudaEventRecord(start, 0));

    // 等待所有流执行完毕
    for (int i = 0; i < n_streams; ++i) {
        CHECK_CUDA(cudaStreamSynchronize(streams[i]));
    }

    CHECK_CUDA(cudaEventRecord(stop, 0));
    CHECK_CUDA(cudaEventSynchronize(stop));

    float elapsed_time;
    CHECK_CUDA(cudaEventElapsedTime(&elapsed_time, start, stop));
    printf("CUDA Graph 总执行时间: %.3f ms\n", elapsed_time);
    printf("吞吐量: %.3f GFLOP/s\n", (static_cast<double>(n_streams) * data_size * iterations * 2 * 1e-3) / elapsed_time);

    // 4. 清理资源
    for (int i = 0; i < n_streams; ++i) {
        CHECK_CUDA(cudaStreamDestroy(streams[i]));
        CHECK_CUDA(cudaFreeHost(h_a[i]));
        CHECK_CUDA(cudaFreeHost(h_b[i]));
        CHECK_CUDA(cudaFreeHost(h_c[i]));
        CHECK_CUDA(cudaFreeHost(h_result[i]));
        CHECK_CUDA(cudaFree(d_a[i]));
        CHECK_CUDA(cudaFree(d_b[i]));
        CHECK_CUDA(cudaFree(d_c[i]));
        CHECK_CUDA(cudaFree(d_result[i]));
    }
    CHECK_CUDA(cudaEventDestroy(start));
    CHECK_CUDA(cudaEventDestroy(stop));

    return 0;
}

nsys

There is no reason to expect kernel overlap. Your data set size is 1048576. The RTX3070 has 46 SMs, each of which can hold at most 1536 threads resident. The total number of threads needed to process your data set is 1048576, which is greater than 1536x46 = 70,656. Therefore a single kernel launch will completely occupy the GPU for some time, preventing concurrent execution of kernels during that time.

In fact your nsys picture seems to show a small amount of kernel overlap. This is what I would expect. The data set size of 1048576 divided by the wave size of 70,656 gives 14.84 waves. So during the last wave, then the next kernel can start to execute.

This is all pretty much expected and you can read about some aspects of it in this blog.

Your nsys picture shows that there is overlap of some data copy operations with a kernel.

Different types of kernels do not run in parallel when using multiple streams. In the nsys results, it can be observed that operations of the same type, such as float4 and int4, execute in parallel, whereas operations of different types, such as float4 and int4, run serially.

code

#include <iostream>
#include <vector>
#include <cassert>

// 定义内核中使用的类型 T 为 float4,以实现向量化

#define REPEAT_TIMES 100000 // 定义内核中的循环次数

// 您的内核函数
template <class T>
__global__ void max_flops_vec4(uint64_t *startClk, uint64_t *stopClk, T *data1, T *data2, T *res, int n) {
  int gid = blockIdx.x * blockDim.x + threadIdx.x;
  if (gid >= n) return; // 防止越界

  T s1 = data1[gid];
  T s2 = data2[gid];
  T result = {0, 0, 0, 0};

  // synchronize all warps in the block
  __syncwarp(); // 更常见的做法是使用 __syncthreads() 同步整个线程块
                // __syncwarp() 仅同步当前 warp(32个线程)

  // start timing
  uint64_t start = 0;
  start = clock64(); // 使用 clock64() 而不是 clock()

  // 计算密集型循环
  #pragma unroll 32
  for (int i = 0; i < REPEAT_TIMES; ++i) {
    result.x += s1.x * s2.x;
    result.y += s1.y * s2.y;
    result.z += s1.z * s2.z;
    result.w += s1.w * s2.w;
    // 添加一些操作防止编译器过度优化
    s1.x += 0.01f;
    s2.x -= 0.01f;
  }

  // synchronize all warps in the block
  __syncwarp();

  // stop timing
  uint64_t stop = 0;
  stop = clock64();

  // write time and data back to memory
  startClk[gid] = start;
  stopClk[gid] = stop;
  res[gid] = result;
}

template <class T>
 void test()
 {
   const int num_streams = 4;   // 创建的 Stream 数量
  const int n = 1024;  // 每个 Stream 处理的数据量 (1M elements)
  const int threads_per_block = 256;
  const int blocks = (n + threads_per_block - 1) / threads_per_block;

  std::vector<cudaStream_t> streams(num_streams);
  // 为每个 Stream 分配主机和设备内存指针
  std::vector<uint64_t*> h_startClk(num_streams);
  std::vector<uint64_t*> h_stopClk(num_streams);
  std::vector<T*> h_data1(num_streams);
  std::vector<T*> h_data2(num_streams);
  std::vector<T*> h_res(num_streams);

  std::vector<uint64_t*> d_startClk(num_streams);
  std::vector<uint64_t*> d_stopClk(num_streams);
  std::vector<T*> d_data1(num_streams);
  std::vector<T*> d_data2(num_streams);
  std::vector<T*> d_res(num_streams);

  // 创建 Stream 并分配内存
  for (int i = 0; i < num_streams; ++i) {
    // 1. 创建 Stream
    cudaStreamCreate(&streams[i]);

    // 2. 分配主机锁页内存 (Pinned Memory),这对于异步拷贝至关重要
    cudaMallocHost(&h_startClk[i], n * sizeof(uint64_t));
    cudaMallocHost(&h_stopClk[i], n * sizeof(uint64_t));
    cudaMallocHost(&h_data1[i], n * sizeof(T));
    cudaMallocHost(&h_data2[i], n * sizeof(T));
    cudaMallocHost(&h_res[i], n * sizeof(T));

    // 3. 分配设备内存
    cudaMalloc(&d_startClk[i], n * sizeof(uint64_t));
    cudaMalloc(&d_stopClk[i], n * sizeof(uint64_t));
    cudaMalloc(&d_data1[i], n * sizeof(T));
    cudaMalloc(&d_data2[i], n * sizeof(T));
    cudaMalloc(&d_res[i], n * sizeof(T));

    // 4. 初始化主机数据
    for (int j = 0; j < n; ++j) {
        h_data1[i][j] = T{static_cast<float>(j), static_cast<float>(j+1), static_cast<float>(j+2), static_cast<float>(j+3)};
        h_data2[i][j] = T{static_cast<float>(j*0.5), static_cast<float>(j*0.5+1), static_cast<float>(j*0.5+2), static_cast<float>(j*0.5+3)};
    }
  }

  // 在每个 Stream 中异步启动任务序列
  for (int i = 0; i < num_streams; ++i) {
    // 异步拷贝:主机到设备 (H2D)
    cudaMemcpyAsync(d_data1[i], h_data1[i], n * sizeof(T), cudaMemcpyHostToDevice, streams[i]);
    cudaMemcpyAsync(d_data2[i], h_data2[i], n * sizeof(T), cudaMemcpyHostToDevice, streams[i]);

    // 异步内核启动
	if (i < num_streams / 2)
		max_flops_vec4<float4><<<blocks, threads_per_block, 0, streams[i]>>>(d_startClk[i], d_stopClk[i], d_data1[i], d_data2[i], d_res[i], n);
	else
		max_flops_vec4<int4><<<blocks, threads_per_block, 0, streams[i]>>>(d_startClk[i], d_stopClk[i], (int4*)d_data1[i], (int4*)d_data2[i], (int4*)d_res[i], n);
    // 异步拷贝:设备到主机 (D2H)
    cudaMemcpyAsync(h_startClk[i], d_startClk[i], n * sizeof(uint64_t), cudaMemcpyDeviceToHost, streams[i]);
    cudaMemcpyAsync(h_stopClk[i], d_stopClk[i], n * sizeof(uint64_t), cudaMemcpyDeviceToHost, streams[i]);
    cudaMemcpyAsync(h_res[i], d_res[i], n * sizeof(T), cudaMemcpyDeviceToHost, streams[i]);
  }

  // 等待所有 Stream 中的所有任务完成
  cudaDeviceSynchronize(); // 也可以使用 for-loop 和 cudaStreamSynchronize 分别等待每个 Stream

  // 检查错误
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) {
    std::cerr << "CUDA error: " << cudaGetErrorString(err) << std::endl;
  }

  // 简单验证结果并输出第一个 Stream 的第一个元素的计时和结果
  std::cout << "Stream 0 - First element: " << std::endl;
  std::cout << "  Start Clock: " << h_startClk[0][0] << std::endl;
  std::cout << "  Stop Clock: " << h_stopClk[0][0] << std::endl;
  std::cout << "  Result: (" << h_res[0][0].x << ", " << h_res[0][0].y << ", " << h_res[0][0].z << ", " << h_res[0][0].w << ")" << std::endl;
  std::cout << "  Elapsed Cycles: " << h_stopClk[0][0] - h_startClk[0][0] << std::endl;

  // 清理资源
  for (int i = 0; i < num_streams; ++i) {
    cudaStreamDestroy(streams[i]);
    cudaFreeHost(h_startClk[i]);
    cudaFreeHost(h_stopClk[i]);
    cudaFreeHost(h_data1[i]);
    cudaFreeHost(h_data2[i]);
    cudaFreeHost(h_res[i]);
    cudaFree(d_startClk[i]);
    cudaFree(d_stopClk[i]);
    cudaFree(d_data1[i]);
    cudaFree(d_data2[i]);
    cudaFree(d_res[i]);
  }

  std::cout << "Multi-stream execution completed successfully." << std::endl;
 }
int main() {
  test<float4>();
  return 0;
}

nsys

It seems like we have a different code and a different question each time. I’m not sure I fully understand the current question:

However you may be running into a case of lazy loading. Before attempting to run two “different” kernels at the same time (or really many kinds of concurrency testing) you should ensure that you have the kernels properly loaded.

Thanks, Warm up and then you can start running parallel.