I am a CUDA beginner. I’m currently trying to generate cuDNN calls automatically through compilation, but I’ve run into some problems.
I’ve implemented cuDNN runtime wrappers in MLIR (a compiler framework) so that convolution operations can be executed from the LLVM IR (an intermediate representation) level. To maximize GPU resource utilization, I tried to run two independent convolution operations in parallel on different streams, each with its own cuDNN handle. However, when profiling with Nsight Systems (nsys), I observed that the two convolution kernels do not execute in parallel: there is a significant idle gap between them, and this gap is much longer than the actual kernel execution time.
Here’s the implementation of my cuDNN convolution wrapper function:
// Global CUDA context protection
static std::mutex g_cudnn_mutex;
// Stream to cuDNN handle mapping
static std::unordered_map<CUstream, cudnnHandle_t> g_stream_to_handle;
// Convolution wrapper function
extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuCudnnConv2dForward(
int n, int c, int h, int w_in, // Input dimensions
int k, int r, int s, // Filter dimensions
int pad_h, int pad_w, // Padding
int stride_h, int stride_w, // Stride
int dilation_h, int dilation_w, // Dilation
void* x_data, void* w_data, void* bias_data, // Data pointers
void* y_data, // Output pointer
CUstream stream // CUDA stream
) {
ScopedContext scopedContext;
// Get or create cuDNN handle associated with the stream
cudnnHandle_t local_handle = nullptr;
{
std::lock_guard<std::mutex> lock(g_cudnn_mutex);
auto it = g_stream_to_handle.find(stream);
if (it == g_stream_to_handle.end()) {
// Create new handle for this stream
CUDNN_REPORT_IF_ERROR(cudnnCreate(&local_handle));
CUDNN_REPORT_IF_ERROR(cudnnSetStream(local_handle, stream));
g_stream_to_handle[stream] = local_handle;
} else {
local_handle = it->second;
}
}
// Create descriptors
cudnnTensorDescriptor_t xDesc, yDesc, biasDesc;
cudnnFilterDescriptor_t wDesc;
cudnnConvolutionDescriptor_t convDesc;
CUDNN_REPORT_IF_ERROR(cudnnCreateTensorDescriptor(&xDesc));
CUDNN_REPORT_IF_ERROR(cudnnCreateFilterDescriptor(&wDesc));
CUDNN_REPORT_IF_ERROR(cudnnCreateTensorDescriptor(&yDesc));
CUDNN_REPORT_IF_ERROR(cudnnCreateTensorDescriptor(&biasDesc));
CUDNN_REPORT_IF_ERROR(cudnnCreateConvolutionDescriptor(&convDesc));
// Set input descriptor
CUDNN_REPORT_IF_ERROR(cudnnSetTensor4dDescriptor(
xDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, n, c, h, w_in));
// Set filter descriptor
CUDNN_REPORT_IF_ERROR(cudnnSetFilter4dDescriptor(
wDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, k, c, r, s));
// Set convolution descriptor
CUDNN_REPORT_IF_ERROR(cudnnSetConvolution2dDescriptor(
convDesc, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT));
// Get output dimensions
int out_n, out_c, out_h, out_w;
CUDNN_REPORT_IF_ERROR(cudnnGetConvolution2dForwardOutputDim(
convDesc, xDesc, wDesc, &out_n, &out_c, &out_h, &out_w));
CUDNN_REPORT_IF_ERROR(cudnnSetTensor4dDescriptor(
yDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, out_n, out_c, out_h, out_w));
// Set bias descriptor (1 x K x 1 x 1, one value per output channel)
CUDNN_REPORT_IF_ERROR(cudnnSetTensor4dDescriptor(
biasDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, k, 1, 1));
// Choose algorithm
cudnnConvolutionFwdAlgo_t algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
// Get workspace size
size_t workspaceSize = 0;
CUDNN_REPORT_IF_ERROR(cudnnGetConvolutionForwardWorkspaceSize(
local_handle, xDesc, wDesc, convDesc, yDesc, algo, &workspaceSize));
// Allocate workspace
void* workspace = nullptr;
if (workspaceSize > 0) {
CUdeviceptr wsPtr = 0;
CUDA_REPORT_IF_ERROR(cuMemAlloc(&wsPtr, workspaceSize));
workspace = reinterpret_cast<void*>(wsPtr);
}
// Execute convolution
const float alpha = 1.0f;
const float beta = 0.0f;
CUDNN_REPORT_IF_ERROR(cudnnConvolutionForward(
local_handle, &alpha, xDesc, x_data, wDesc, w_data, convDesc, algo,
workspace, workspaceSize, &beta, yDesc, y_data));
// Add bias if provided
if (bias_data != nullptr) {
const float alpha_bias = 1.0f;
const float beta_bias = 1.0f;
CUDNN_REPORT_IF_ERROR(cudnnAddTensor(
local_handle, &alpha_bias, biasDesc, bias_data, &beta_bias, yDesc, y_data));
}
// Free workspace
if (workspace != nullptr) {
CUDA_REPORT_IF_ERROR(cuMemFree(reinterpret_cast<CUdeviceptr>(workspace)));
}
// Clean up descriptors
CUDNN_REPORT_IF_ERROR(cudnnDestroyTensorDescriptor(xDesc));
CUDNN_REPORT_IF_ERROR(cudnnDestroyFilterDescriptor(wDesc));
CUDNN_REPORT_IF_ERROR(cudnnDestroyTensorDescriptor(yDesc));
CUDNN_REPORT_IF_ERROR(cudnnDestroyTensorDescriptor(biasDesc));
CUDNN_REPORT_IF_ERROR(cudnnDestroyConvolutionDescriptor(convDesc));
}
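For reference, CUDNN_REPORT_IF_ERROR is a thin logging macro modeled on the CUDA_REPORT_IF_ERROR macro that MLIR's CudaRuntimeWrappers already provides; a simplified sketch of the cuDNN variant, assuming the real one only logs and does not abort:
#include <cstdio>
#include <cudnn.h>
// Simplified sketch: report a failing cuDNN call to stderr, but keep going.
#define CUDNN_REPORT_IF_ERROR(expr)                                      \
  do {                                                                    \
    cudnnStatus_t status_ = (expr);                                       \
    if (status_ != CUDNN_STATUS_SUCCESS)                                  \
      fprintf(stderr, "'%s' failed with '%s'\n", #expr,                   \
              cudnnGetErrorString(status_));                               \
  } while (0)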
I test this by creating two independent streams and issuing one convolution call on each:
// Create two streams for parallel execution
CUstream stream1 = nullptr, stream2 = nullptr;
CUDA_REPORT_IF_ERROR(cuStreamCreate(&stream1, CU_STREAM_NON_BLOCKING));
CUDA_REPORT_IF_ERROR(cuStreamCreate(&stream2, CU_STREAM_NON_BLOCKING));
// Allocate separate memory areas
// [Memory allocation code omitted]
// First convolution operation - using stream1
mgpuCudnnConv2dForward(n, c, h, w, k, r, s, pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w, input1, filter1, bias1, output1, stream1);
// Second convolution operation - using stream2
mgpuCudnnConv2dForward(n, c, h, w, k, r, s, pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w, input2, filter2, bias2, output2, stream2);
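The omitted part of the test also joins the two streams before the outputs are read back; a minimal sketch of that join (assumption: a plain per-stream synchronize, the actual test code may differ):
// Assumed shape of the final join: wait for all work on both streams
// before reading the outputs back.
CUDA_REPORT_IF_ERROR(cuStreamSynchronize(stream1));
CUDA_REPORT_IF_ERROR(cuStreamSynchronize(stream2));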
I analyzed the program execution with Nsight Systems (nsys) and observed:
- The two convolution kernels do not execute in parallel
- There’s a substantial idle time interval between the kernels
- This idle interval is much longer than the actual kernel execution time
- Overall execution efficiency is low, with almost no parallel speedup
In the nsys timeline, the left part of the blue box corresponds to the computation time of the first convolution, and the right part to that of the second.
I’d like to understand:
- Is this behavior due to the small data size in the test?
- Are there other factors affecting parallel execution?
- How can I reduce the idle time between kernels and improve scheduling efficiency?
- Are there any special settings required for parallel execution with cuDNN?
Ultimately I want to schedule an entire computation graph in parallel: independent kernels (each corresponding to a cuDNN API call) should execute concurrently, with synchronization points inserted only between dependent operations. The current results suggest that the scheduling overhead is too large for this to pay off. If the whole graph shows behavior similar to my current tests, then even after dependency analysis and parallel scheduling of the full graph, we might not gain much benefit.
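For reference, the synchronization points I have in mind are ordinary event waits between streams, along these lines (a sketch; a_done is a hypothetical event name, and the enqueued operations stand in for whatever cuDNN calls the graph contains):
// Sketch of a cross-stream dependency: operation A runs on stream1,
// operation B (which consumes A's output) runs on stream2.
CUevent a_done = nullptr;
CUDA_REPORT_IF_ERROR(cuEventCreate(&a_done, CU_EVENT_DISABLE_TIMING));
// ... enqueue A on stream1 (e.g. an mgpuCudnnConv2dForward call) ...
CUDA_REPORT_IF_ERROR(cuEventRecord(a_done, stream1));
// stream2 will not start B until A's event has been signaled.
CUDA_REPORT_IF_ERROR(cuStreamWaitEvent(stream2, a_done, 0));
// ... enqueue B on stream2 ...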