I am a CUDA beginner. I’m currently trying to generate cuDNN calls automatically through compilation, but I’ve run into some problems.
I’ve implemented cuDNN runtime wrappers in MLIR (a compiler framework) so that convolution operations can be executed from the LLVM IR (an intermediate representation) level. To maximize GPU resource utilization, I tried to run two independent convolution operations in parallel on different streams, each with its own cuDNN handle. However, when profiling with Nsight Systems (nsys), I observed that the two convolution kernels do not execute in parallel: there is a significant idle gap between them, and this gap is much longer than the actual kernel execution time.
Here’s the implementation of my cuDNN convolution wrapper function:
// Global CUDA context protection
static std::mutex g_cudnn_mutex;
// Stream to cuDNN handle mapping
static std::unordered_map<CUstream, cudnnHandle_t> g_stream_to_handle;
// Convolution wrapper function
extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuCudnnConv2dForward(
int n, int c, int h, int w_in, // Input dimensions
int k, int r, int s, // Filter dimensions
int pad_h, int pad_w, // Padding
int stride_h, int stride_w, // Stride
int dilation_h, int dilation_w, // Dilation
void* x_data, void* w_data, void* bias_data, // Data pointers
void* y_data, // Output pointer
CUstream stream // CUDA stream
) {
ScopedContext scopedContext;
// Get or create cuDNN handle associated with the stream
cudnnHandle_t local_handle = nullptr;
{
std::lock_guard<std::mutex> lock(g_cudnn_mutex);
auto it = g_stream_to_handle.find(stream);
if (it == g_stream_to_handle.end()) {
// Create new handle for this stream
CUDNN_REPORT_IF_ERROR(cudnnCreate(&local_handle));
CUDNN_REPORT_IF_ERROR(cudnnSetStream(local_handle, stream));
g_stream_to_handle[stream] = local_handle;
} else {
local_handle = it->second;
}
}
// Create descriptors
cudnnTensorDescriptor_t xDesc, yDesc, biasDesc;
cudnnFilterDescriptor_t wDesc;
cudnnConvolutionDescriptor_t convDesc;
CUDNN_REPORT_IF_ERROR(cudnnCreateTensorDescriptor(&xDesc));
CUDNN_REPORT_IF_ERROR(cudnnCreateFilterDescriptor(&wDesc));
CUDNN_REPORT_IF_ERROR(cudnnCreateTensorDescriptor(&yDesc));
CUDNN_REPORT_IF_ERROR(cudnnCreateTensorDescriptor(&biasDesc));
CUDNN_REPORT_IF_ERROR(cudnnCreateConvolutionDescriptor(&convDesc));
// Set input descriptor
CUDNN_REPORT_IF_ERROR(cudnnSetTensor4dDescriptor(
xDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, n, c, h, w_in));
// Set filter descriptor
CUDNN_REPORT_IF_ERROR(cudnnSetFilter4dDescriptor(
wDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, k, c, r, s));
// Set convolution descriptor
CUDNN_REPORT_IF_ERROR(cudnnSetConvolution2dDescriptor(
convDesc, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT));
// Get output dimensions
int out_n, out_c, out_h, out_w;
CUDNN_REPORT_IF_ERROR(cudnnGetConvolution2dForwardOutputDim(
convDesc, xDesc, wDesc, &out_n, &out_c, &out_h, &out_w));
CUDNN_REPORT_IF_ERROR(cudnnSetTensor4dDescriptor(
yDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, out_n, out_c, out_h, out_w));
// Set bias descriptor (1 x K x 1 x 1, one value per output channel)
CUDNN_REPORT_IF_ERROR(cudnnSetTensor4dDescriptor(
biasDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, k, 1, 1));
// Choose algorithm
cudnnConvolutionFwdAlgo_t algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
// Get workspace size
size_t workspaceSize = 0;
CUDNN_REPORT_IF_ERROR(cudnnGetConvolutionForwardWorkspaceSize(
local_handle, xDesc, wDesc, convDesc, yDesc, algo, &workspaceSize));
// Allocate workspace
void* workspace = nullptr;
if (workspaceSize > 0) {
CUdeviceptr wsPtr = 0;
CUDA_REPORT_IF_ERROR(cuMemAlloc(&wsPtr, workspaceSize));
workspace = reinterpret_cast<void*>(wsPtr);
}
// Execute convolution
const float alpha = 1.0f;
const float beta = 0.0f;
CUDNN_REPORT_IF_ERROR(cudnnConvolutionForward(
local_handle, &alpha, xDesc, x_data, wDesc, w_data, convDesc, algo,
workspace, workspaceSize, &beta, yDesc, y_data));
// Add bias if provided
if (bias_data != nullptr) {
const float alpha_bias = 1.0f;
const float beta_bias = 1.0f;
CUDNN_REPORT_IF_ERROR(cudnnAddTensor(
local_handle, &alpha_bias, biasDesc, bias_data, &beta_bias, yDesc, y_data));
}
// Free workspace
if (workspace != nullptr) {
CUDA_REPORT_IF_ERROR(cuMemFree(reinterpret_cast<CUdeviceptr>(workspace)));
}
// Clean up descriptors
CUDNN_REPORT_IF_ERROR(cudnnDestroyTensorDescriptor(xDesc));
CUDNN_REPORT_IF_ERROR(cudnnDestroyFilterDescriptor(wDesc));
CUDNN_REPORT_IF_ERROR(cudnnDestroyTensorDescriptor(yDesc));
CUDNN_REPORT_IF_ERROR(cudnnDestroyTensorDescriptor(biasDesc));
CUDNN_REPORT_IF_ERROR(cudnnDestroyConvolutionDescriptor(convDesc));
}
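For reference, CUDNN_REPORT_IF_ERROR is a thin logging macro modeled on the CUDA_REPORT_IF_ERROR macro that MLIR's CudaRuntimeWrappers already provides; a simplified sketch of the cuDNN variant, assuming the real one only logs and does not abort:
#include <cstdio>
#include <cudnn.h>
// Simplified sketch: report a failing cuDNN call to stderr, but keep going.
#define CUDNN_REPORT_IF_ERROR(expr)                                      \
  do {                                                                    \
    cudnnStatus_t status_ = (expr);                                       \
    if (status_ != CUDNN_STATUS_SUCCESS)                                  \
      fprintf(stderr, "'%s' failed with '%s'\n", #expr,                   \
              cudnnGetErrorString(status_));                               \
  } while (0)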
I test this by creating two independent streams and issuing one convolution call on each:
// Create two streams for parallel execution
CUstream stream1 = nullptr, stream2 = nullptr;
CUDA_REPORT_IF_ERROR(cuStreamCreate(&stream1, CU_STREAM_NON_BLOCKING));
CUDA_REPORT_IF_ERROR(cuStreamCreate(&stream2, CU_STREAM_NON_BLOCKING));
// Allocate separate memory areas
// [Memory allocation code omitted]
// First convolution operation - using stream1
mgpuCudnnConv2dForward(n, c, h, w, k, r, s, pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w, input1, filter1, bias1, output1, stream1);
// Second convolution operation - using stream2
mgpuCudnnConv2dForward(n, c, h, w, k, r, s, pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w, input2, filter2, bias2, output2, stream2);
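The omitted part of the test also joins the two streams before the outputs are read back; a minimal sketch of that join (assumption: a plain per-stream synchronize, the actual test code may differ):
// Assumed shape of the final join: wait for all work on both streams
// before reading the outputs back.
CUDA_REPORT_IF_ERROR(cuStreamSynchronize(stream1));
CUDA_REPORT_IF_ERROR(cuStreamSynchronize(stream2));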
I analyzed the program execution with Nsight Systems (nsys) and observed:
- The two convolution kernels do not execute in parallel
- There’s a substantial idle time interval between the kernels
- This idle interval is much longer than the actual kernel execution time
- Overall execution efficiency is low, with almost no parallel speedup
In the nsys timeline, the left part of the blue box corresponds to the computation time of the first convolution, and the right part to that of the second.
I’d like to understand:
- Is this behavior due to the small data size in the test?
- Are there other factors affecting parallel execution?
- How can I reduce the idle time between kernels and improve scheduling efficiency?
- Are there any special settings required for parallel execution with cuDNN?
Ultimately I want to schedule an entire computation graph in parallel: independent kernels (each corresponding to a cuDNN API call) should execute concurrently, with synchronization points inserted only between dependent operations. The current results suggest that the scheduling overhead is too large for this to pay off. If the whole graph shows behavior similar to my current tests, then even after dependency analysis and parallel scheduling of the full graph, we might not gain much benefit.
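For reference, the synchronization points I have in mind are ordinary event waits between streams, along these lines (a sketch; a_done is a hypothetical event name, and the enqueued operations stand in for whatever cuDNN calls the graph contains):
// Sketch of a cross-stream dependency: operation A runs on stream1,
// operation B (which consumes A's output) runs on stream2.
CUevent a_done = nullptr;
CUDA_REPORT_IF_ERROR(cuEventCreate(&a_done, CU_EVENT_DISABLE_TIMING));
// ... enqueue A on stream1 (e.g. an mgpuCudnnConv2dForward call) ...
CUDA_REPORT_IF_ERROR(cuEventRecord(a_done, stream1));
// stream2 will not start B until A's event has been signaled.
CUDA_REPORT_IF_ERROR(cuStreamWaitEvent(stream2, a_done, 0));
// ... enqueue B on stream2 ...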