I’m a CUDA beginner, and I’ve encountered behavior that seems counter-intuitive. When I use multiple streams to execute the same kernel on different data, I observe excellent overlapping execution. However, when I launch different kernels (with no dependencies between them) in different streams, they execute mostly or completely serially, with minimal overlap.
As I understand it, if kernel execution time is much longer than kernel launch overhead, shouldn't both scenarios show near-complete overlap? But in my tests, independent kernels launched asynchronously into separate streams consistently execute almost serially, and sometimes completely serially.
Below is the test code I used, along with an nsys profiler screenshot. You can see that the third and fourth kernel launches (the ones issued into stream1 and stream2) should be able to run in parallel, but they are almost entirely serialized.
#include <stdio.h>
#include <cuda_runtime.h>

#define N (1024*1024*512)
#define THREADS_PER_BLOCK 1024

#define CHECK(call) { \
    cudaError_t err = call; \
    if (err != cudaSuccess) { \
        printf("CUDA Error at %s:%d - %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
        exit(1); \
    } \
}

__global__ void kernel_square(float* data) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < N) {
        float val = data[idx];
        for (int i = 0; i < 500; i++) {
            val = val * val;
        }
        data[idx] = val;
    }
}

__global__ void kernel_cube(float* data) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < N) {
        float val = data[idx];
        for (int i = 0; i < 500; i++) {
            val = val * val * val;
        }
        data[idx] = val;
    }
}

int main() {
    float *d_data1, *d_data2;
    cudaStream_t stream1, stream2;
    cudaEvent_t start, stop;

    CHECK(cudaMalloc(&d_data1, N * sizeof(float)));   // 2 GiB per buffer
    CHECK(cudaMalloc(&d_data2, N * sizeof(float)));
    CHECK(cudaStreamCreate(&stream1));
    CHECK(cudaStreamCreate(&stream2));
    CHECK(cudaEventCreate(&start));
    CHECK(cudaEventCreate(&stop));

    // Baseline: run the two kernels back to back on the default stream.
    CHECK(cudaEventRecord(start));
    kernel_square<<<(N + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(d_data1);
    CHECK(cudaGetLastError());
    CHECK(cudaDeviceSynchronize());
    kernel_cube<<<(N + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(d_data2);
    CHECK(cudaGetLastError());
    CHECK(cudaEventRecord(stop));
    CHECK(cudaEventSynchronize(stop));
    float sequential_time;
    CHECK(cudaEventElapsedTime(&sequential_time, start, stop));

    // Concurrent attempt: launch the two independent kernels into separate streams.
    CHECK(cudaEventRecord(start));
    kernel_square<<<(N + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK, 0, stream1>>>(d_data1);
    kernel_cube<<<(N + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK, 0, stream2>>>(d_data2);
    CHECK(cudaGetLastError());
    CHECK(cudaEventRecord(stop));
    CHECK(cudaEventSynchronize(stop));
    float concurrent_time;
    CHECK(cudaEventElapsedTime(&concurrent_time, start, stop));

    printf("sequential: %.3f ms, concurrent: %.3f ms\n", sequential_time, concurrent_time);

    CHECK(cudaFree(d_data1));
    CHECK(cudaFree(d_data2));
    CHECK(cudaStreamDestroy(stream1));
    CHECK(cudaStreamDestroy(stream2));
    CHECK(cudaEventDestroy(start));
    CHECK(cudaEventDestroy(stop));
    return 0;
}
For comparison, here's another test where I launch the same kernel in both streams, and the two launches overlap almost completely. (Sorry, as a new user I'm unable to insert multiple images; what that screenshot would show is two kernels executing with nearly perfect overlap.)
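Roughly, the launches in that comparison test have this form (same kernel, different buffers, one launch per stream; I'm not reproducing the full program, and the launch configuration is the same as in the code above):

    // Same kernel in both streams, operating on different buffers
    kernel_square<<<(N + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK, 0, stream1>>>(d_data1);
    kernel_square<<<(N + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK, 0, stream2>>>(d_data2);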
What’s causing this difference in behavior? Is there something I’m missing about how the CUDA scheduler works with different kernel types?
If I want to achieve near-perfect overlap when launching different kernels asynchronously into separate streams, what approach should I take?
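For example, would it be the right direction to rewrite the kernels with a grid-stride loop and launch each one with a much smaller grid, so that neither kernel fills the whole GPU by itself? A rough sketch of what I mean (kernel_cube_strided would be the analogous rewrite of kernel_cube, and the block count of 64 is just an arbitrary guess, not something I've verified):

    __global__ void kernel_square_strided(float* data, int n) {
        // Grid-stride loop: covers all n elements even when the grid
        // is much smaller than n / blockDim.x.
        for (int idx = blockIdx.x * blockDim.x + threadIdx.x;
             idx < n;
             idx += gridDim.x * blockDim.x) {
            float val = data[idx];
            for (int i = 0; i < 500; i++) {
                val = val * val;
            }
            data[idx] = val;
        }
    }

    // Deliberately small grids, hoping both kernels can be resident
    // on the GPU at the same time:
    kernel_square_strided<<<64, THREADS_PER_BLOCK, 0, stream1>>>(d_data1, N);
    kernel_cube_strided<<<64, THREADS_PER_BLOCK, 0, stream2>>>(d_data2, N);

Or is limiting each kernel's grid like this not the intended way to get concurrency between different kernels?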