Asynchronous memcpys not overlapping with asynchronous kernel execution despite using CUDA streams?

Hi, I’m a newcomer to CUDA, and I’m trying to use it to accelerate a program. When I pipeline the workload with CUDA streams, the HtoD operations do not overlap with kernel execution, and I can’t figure out why.
As the profiler timeline shows, HtoD memcpys only overlap with DtoH memcpys, never with kernels.

Below is the code. My GPU is an RTX 2080 Ti, the OS is Arch Linux, and the CUDA version is 11.8.

#define TBLOCK_SZ 512
#define MAX_THREAD 1024
#define TILE_WIDTH 16
#define SHARED_MEM_SZ 32768

void ProcessCu( const size_t size,          // size of one chunk of data
                const u32 inputIdxStart,
                const u32 inputIdxEnd,
                const void *inputBuf, 
                const u32 outputIdxStart,
                const u32 outputIdxEnd,
                void *outputBuf )
{
  // CUDA Device compatible Galois type.
  typedef GaloisCu<G::Bits, G::Generator, G::ValueType> Gd;
  Gd::uploadTable();

  const u32 inCount = inputIdxEnd - inputIdxStart + 1;
  const u32 outCount = outputIdxEnd - outputIdxStart + 1;
  const u32 wordPerChunk = size / sizeof(Gd);
  // Batches need to be 4-byte aligned
  const u32 wordPerBatch = (TBLOCK_SZ * SHARED_MEM_SZ / ( MAX_THREAD * TILE_WIDTH * sizeof(Gd) ) - 1) & ~1;
  const u32 tileCount = inCount / TILE_WIDTH + ( inCount % TILE_WIDTH != 0 );
  const u32 batchCount = ceil( (float) wordPerChunk / wordPerBatch );

  // Allocate GPU memory buffers
  Gd *d_bases, *d_exponents;

  cudaErrchk( cudaMalloc( (void**) &d_bases, inCount * sizeof(Gd) ) );
  cudaErrchk( cudaMalloc( (void**) &d_exponents, outCount * sizeof(Gd) ) );

  // Copy bases and exponents to GPU
  u16 *baseOffset = &database[inputIdxStart];
  u16 *exponents = new u16[outCount];
  for ( int i = outputIdxStart; i <= outputIdxEnd; ++i ) {
    exponents[i - outputIdxStart] = outputrows[i].exponent;
  }

  cudaErrchk( cudaMemcpyAsync( d_bases, baseOffset, inCount * sizeof(Gd), cudaMemcpyHostToDevice ) );
  cudaErrchk( cudaMemcpyAsync( d_exponents, exponents, outCount * sizeof(u16), cudaMemcpyHostToDevice ) );
  cudaErrchk( cudaDeviceSynchronize() );
  delete [] exponents;

  // Set kernel launch parameters
  dim3 dimGrid( tileCount );
  dim3 dimBlock( TBLOCK_SZ );

  cudaStream_t *stream = new cudaStream_t[batchCount];
  Gd **d_input = new Gd*[batchCount];
  Gd **d_intermediate = new Gd*[batchCount];
  Gd **d_output = new Gd*[batchCount];
  
  for ( u32 i = 0; i < batchCount; ++i ) {
    // Create stream
    cudaErrchk( cudaStreamCreateWithFlags( &stream[i], cudaStreamNonBlocking ) );
    // Allocate device memory
    int batchSz = wordPerBatch;
    if ( i == batchCount - 1 ) {
      batchSz = wordPerChunk - i * wordPerBatch;
    }
    int batchSzAligned = batchSz + (batchSz & 1);
    cudaErrchk( cudaMalloc( (void**) &d_input[i], inCount * batchSz * sizeof(Gd) ) );
    cudaErrchk( cudaMalloc( (void**) &d_intermediate[i], tileCount * batchSzAligned * outCount * sizeof(Gd) ) );
    cudaErrchk( cudaMalloc( (void**) &d_output[i], outCount * batchSzAligned * sizeof(Gd) ) );

  }

  // Invoke copies and kernels concurrently, one stream per batch
  for ( u32 batchIdx = 0; batchIdx < batchCount; ++batchIdx ) {
    int batchSz = wordPerBatch;
    if ( batchIdx == batchCount - 1 ) {
      batchSz = wordPerChunk - batchIdx * wordPerBatch;
    }
    int batchSzAligned = batchSz + (batchSz & 1);

    // Copy input data to GPU
    for ( u32 i = 0; i < inCount; ++i ) {
      void *inputBufOffset = (char*) inputBuf + i * size + batchIdx * wordPerBatch * sizeof(G);
      void *d_inputBufOffset = (char*) d_input[batchIdx] + i * batchSz * sizeof(G);
      cudaErrchk( cudaMemcpyAsync( d_inputBufOffset, inputBufOffset, batchSz * sizeof(G), cudaMemcpyHostToDevice, stream[batchIdx] ) );
    }

    // Launch Compute Kernel
    ProcessKer<<<dimGrid, dimBlock, (batchSzAligned + 1) * TILE_WIDTH * sizeof(G), stream[batchIdx]>>> 
    ( batchSz,
      d_input[batchIdx],
      d_bases,
      inCount,
      outCount,
      d_intermediate[batchIdx],
      d_exponents
    );
    // Launch Reduce Kernel
    dim3 dimBlockReduce( 32 );
    dim3 dimGridReduce( ceil( outCount / (float) dimBlockReduce.x ), batchSzAligned / 2 );
    ReduceKer<<<dimGridReduce, dimBlockReduce, 0, stream[batchIdx]>>>
    ( (u32*) d_intermediate[batchIdx],
      (u32*) d_output[batchIdx],
      outCount,
      tileCount
    );

    // Copy Result to output buffer
    for ( u32 i = 0; i < outCount; ++i ) {
      cudaErrchk( cudaMemcpyAsync( &((G*) outputBuf)[wordPerChunk * i + wordPerBatch * batchIdx],
                                   &d_output[batchIdx][batchSzAligned * i],
                                   batchSz * sizeof(GaloisCu16),
                                   cudaMemcpyDeviceToHost,
                                   stream[batchIdx] ) );
    }
    
  }
  cudaErrchk( cudaDeviceSynchronize() );
  
  for ( u32 i = 0; i < batchCount; ++i ) {
    // Destroy stream and free device memory
    cudaErrchk( cudaStreamDestroy( stream[i] ) );
    cudaFree( d_input[i] );
    cudaFree( d_intermediate[i] );
    cudaFree( d_output[i] );
  }

  // Free memory
  cudaFree( d_bases );
  cudaFree( d_exponents );
  delete[] stream;
  delete[] d_input;
  delete[] d_intermediate;
  delete[] d_output;
 
}

From the timeline one can see that cudaMemcpyAsync blocks until the kernels have finished. This is always the case for device-to-host transfers when the host memory is pageable. In general, cudaMemcpyAsync to or from host memory can only be non-blocking when the host buffer is pinned (page-locked) rather than pageable, regardless of the copy direction.
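To illustrate the difference, here is a minimal sketch (not your code; all buffer and stream names are made up):

  const size_t N = 1 << 20;
  float *d_buf, *pinned;
  float *pageable = new float[N];  // ordinary pageable allocation
  cudaStream_t stream;

  cudaMalloc( (void**) &d_buf, N * sizeof(float) );
  cudaMallocHost( (void**) &pinned, N * sizeof(float) );  // page-locked allocation
  cudaStreamCreate( &stream );

  // Pageable source: the driver must stage the data through an internal
  // buffer, so this transfer cannot reliably overlap kernel execution.
  cudaMemcpyAsync( d_buf, pageable, N * sizeof(float), cudaMemcpyHostToDevice, stream );

  // Pinned source: the copy is merely queued on the stream, and the DMA
  // engine can run it concurrently with kernels in other streams.
  cudaMemcpyAsync( d_buf, pinned, N * sizeof(float), cudaMemcpyHostToDevice, stream );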

How are const void *inputBuf and void* outputBuf allocated?

I allocated them as pinned memory (I think):

  G *input, *outputGPU;
  u16 *exp;
  cudaErrchk( cudaMallocHost( (void**) &input, inCount * chunkSz ) );
  cudaErrchk( cudaMallocHost( (void**) &outputGPU, outCount * chunkSz ) );
  cudaErrchk( cudaMallocHost( (void**) &exp, outCount * sizeof(u16) ) );
  cudaErrchk( cudaMallocHost( (void**) &database, inCount * sizeof(u16) ) );
  memset( outputGPU, 0, outCount * chunkSz );
  
  // Code that load data into input, exp, database...

  ProcessCu( chunkSz, 0, inCount - 1, input, 0, outCount - 1, outputGPU );
  
  // Free memory
  cudaFreeHost( input );
  cudaFreeHost( outputGPU );
  cudaFreeHost( exp );
  cudaFreeHost( database );
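One way to double-check that these buffers really are pinned is to query the pointer attributes (a small sketch, reusing the cudaErrchk macro from above):

  cudaPointerAttributes attr;
  cudaErrchk( cudaPointerGetAttributes( &attr, input ) );
  // cudaMemoryTypeHost means page-locked host memory;
  // cudaMemoryTypeUnregistered means a plain pageable allocation (CUDA 11+).
  if ( attr.type != cudaMemoryTypeHost ) {
    fprintf( stderr, "input is not pinned host memory\n" );
  }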

I figured out the problem. For some reason, issuing the D2H memcpys in a for loop causes them to block even though the host memory is pinned. I added an extra host staging buffer so that the H2D and D2H memcpys are each done in one go, and now the streams execute concurrently.
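For reference, here is a rough sketch of that workaround (the h_staging buffers are hypothetical names, one pinned Gd buffer per batch allocated with cudaMallocHost; the same gather-then-single-copy idea applies on the H2D side). Inside the batch loop, the outCount small strided copies are replaced by one contiguous copy per batch:

    cudaErrchk( cudaMemcpyAsync( h_staging[batchIdx],
                                 d_output[batchIdx],
                                 outCount * batchSzAligned * sizeof(Gd),
                                 cudaMemcpyDeviceToHost,
                                 stream[batchIdx] ) );

After cudaDeviceSynchronize(), the rows are scattered into the strided layout of outputBuf on the CPU:

  for ( u32 batchIdx = 0; batchIdx < batchCount; ++batchIdx ) {
    u32 batchSz = ( batchIdx == batchCount - 1 )
                  ? wordPerChunk - batchIdx * wordPerBatch : wordPerBatch;
    u32 batchSzAligned = batchSz + (batchSz & 1);
    for ( u32 i = 0; i < outCount; ++i ) {
      memcpy( &((G*) outputBuf)[wordPerChunk * i + wordPerBatch * batchIdx],
              &h_staging[batchIdx][batchSzAligned * i],
              batchSz * sizeof(G) );
    }
  }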

