code
#include <cstdio>
#include <cuda_runtime.h>
#define CHECK_CUDA(call) { \
cudaError_t err = call; \
if (err != cudaSuccess) { \
printf("CUDA Error at %s:%d - %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
exit(EXIT_FAILURE); \
} \
}
__global__ void fma_kernel(float* a, float* b, float* c, float* result, int n, int iterations) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= n) return;
float val_a = a[idx];
float val_b = b[idx];
float val_c = c[idx];
float res = 0.0f;
for (int i = 0; i < iterations; ++i) {
res = val_a * val_b + val_c;
val_a = res;
val_b = res;
val_c = res;
}
result[idx] = res;
}
int main() {
const int n_streams = 4;
const int data_size = 1 << 20;
const int block_size = 256;
const int grid_size = (data_size + block_size - 1) / block_size;
const int iterations = 1000;
printf("使用 CUDA Graph 配置: %d 个流, 每个流 %d 个元素, %d 次迭代\n", n_streams, data_size, iterations);
// 1. 创建流和分配内存(每个流仍然需要独立的内存)
cudaStream_t streams[n_streams];
cudaGraph_t graphs[n_streams];
cudaGraphExec_t graphInstances[n_streams];
float *h_a[n_streams], *h_b[n_streams], *h_c[n_streams], *h_result[n_streams];
float *d_a[n_streams], *d_b[n_streams], *d_c[n_streams], *d_result[n_streams];
for (int i = 0; i < n_streams; ++i) {
CHECK_CUDA(cudaStreamCreate(&streams[i]));
CHECK_CUDA(cudaMallocHost(&h_a[i], data_size * sizeof(float)));
CHECK_CUDA(cudaMallocHost(&h_b[i], data_size * sizeof(float)));
CHECK_CUDA(cudaMallocHost(&h_c[i], data_size * sizeof(float)));
CHECK_CUDA(cudaMallocHost(&h_result[i], data_size * sizeof(float)));
for (int j = 0; j < data_size; ++j) {
h_a[i][j] = static_cast<float>(j + 1 + i);
h_b[i][j] = static_cast<float>(j + 2 + i);
h_c[i][j] = static_cast<float>(j + 3 + i);
}
CHECK_CUDA(cudaMalloc(&d_a[i], data_size * sizeof(float)));
CHECK_CUDA(cudaMalloc(&d_b[i], data_size * sizeof(float)));
CHECK_CUDA(cudaMalloc(&d_c[i], data_size * sizeof(float)));
CHECK_CUDA(cudaMalloc(&d_result[i], data_size * sizeof(float)));
}
// 2. 为每个流创建并实例化一个 Graph
for (int i = 0; i < n_streams; ++i) {
cudaGraph_t graph;
CHECK_CUDA(cudaStreamBeginCapture(streams[i], cudaStreamCaptureModeGlobal));
// 在流中执行操作,这些操作将被捕获到Graph中
CHECK_CUDA(cudaMemcpyAsync(d_a[i], h_a[i], data_size * sizeof(float), cudaMemcpyHostToDevice, streams[i]));
CHECK_CUDA(cudaMemcpyAsync(d_b[i], h_b[i], data_size * sizeof(float), cudaMemcpyHostToDevice, streams[i]));
CHECK_CUDA(cudaMemcpyAsync(d_c[i], h_c[i], data_size * sizeof(float), cudaMemcpyHostToDevice, streams[i]));
fma_kernel<<<grid_size, block_size, 0, streams[i]>>>(d_a[i], d_b[i], d_c[i], d_result[i], data_size, iterations);
CHECK_CUDA(cudaMemcpyAsync(h_result[i], d_result[i], data_size * sizeof(float), cudaMemcpyDeviceToHost, streams[i]));
CHECK_CUDA(cudaStreamEndCapture(streams[i], &graph)); // 结束捕获,得到Graph
graphs[i] = graph;
// 实例化Graph,创建一个可执行的Graph实例
cudaGraphExec_t instance;
CHECK_CUDA(cudaGraphInstantiate(&instance, graphs[i], NULL, NULL, 0));
graphInstances[i] = instance;
}
// 3. 计时并执行Graph
cudaEvent_t start, stop;
CHECK_CUDA(cudaEventCreate(&start));
CHECK_CUDA(cudaEventCreate(&stop));
CHECK_CUDA(cudaEventRecord(start, 0));
// 一次性启动所有Graph!这是并发的关键。
for (int i = 0; i < n_streams; ++i) {
CHECK_CUDA(cudaGraphLaunch(graphInstances[i], streams[i]));
}
// 等待所有流执行完毕
for (int i = 0; i < n_streams; ++i) {
CHECK_CUDA(cudaStreamSynchronize(streams[i]));
}
CHECK_CUDA(cudaEventRecord(stop, 0));
CHECK_CUDA(cudaEventSynchronize(stop));
float elapsed_time;
CHECK_CUDA(cudaEventElapsedTime(&elapsed_time, start, stop));
printf("CUDA Graph 总执行时间: %.3f ms\n", elapsed_time);
printf("吞吐量: %.3f GFLOP/s\n", (static_cast<double>(n_streams) * data_size * iterations * 2 * 1e-3) / elapsed_time);
// 4. 清理资源
for (int i = 0; i < n_streams; ++i) {
CHECK_CUDA(cudaGraphExecDestroy(graphInstances[i]));
CHECK_CUDA(cudaGraphDestroy(graphs[i]));
CHECK_CUDA(cudaStreamDestroy(streams[i]));
CHECK_CUDA(cudaFreeHost(h_a[i]));
CHECK_CUDA(cudaFreeHost(h_b[i]));
CHECK_CUDA(cudaFreeHost(h_c[i]));
CHECK_CUDA(cudaFreeHost(h_result[i]));
CHECK_CUDA(cudaFree(d_a[i]));
CHECK_CUDA(cudaFree(d_b[i]));
CHECK_CUDA(cudaFree(d_c[i]));
CHECK_CUDA(cudaFree(d_result[i]));
}
CHECK_CUDA(cudaEventDestroy(start));
CHECK_CUDA(cudaEventDestroy(stop));
return 0;
}
nsys




