Asynchronous memcpys not overlapping with asynchronous kernel execution despite using CUDA streams?

Hi, I’m a newcomer to CUDA, and I’m trying to use it to accelerate a program. When I pipeline the workload with CUDA streams, the HtoD operations do not overlap with kernel execution, and I can’t figure out why.
As the profiler timeline shows, HtoD memcpys only overlap with DtoH memcpys, never with kernels.

Below is the code. My GPU is an RTX 2080 Ti, the OS is Arch Linux, and the CUDA version is 11.8.

#define TBLOCK_SZ 512
#define MAX_THREAD 1024
#define TILE_WIDTH 16
#define SHARED_MEM_SZ 32768

void ProcessCu( const size_t size,          // size of one chunk of data
                const u32 inputIdxStart,
                const u32 inputIdxEnd,
                const void *inputBuf, 
                const u32 outputIdxStart,
                const u32 outputIdxEnd,
                void *outputBuf )
{
  // CUDA Device compatible Galois type.
  typedef GaloisCu<G::Bits, G::Generator, G::ValueType> Gd;
  Gd::uploadTable();

  const u32 inCount = inputIdxEnd - inputIdxStart + 1;
  const u32 outCount = outputIdxEnd - outputIdxStart + 1;
  const u32 wordPerChunk = size / sizeof(Gd);
  // Batches need to be 4-byte aligned
  const u32 wordPerBatch = (TBLOCK_SZ * SHARED_MEM_SZ / ( MAX_THREAD * TILE_WIDTH * sizeof(Gd) ) - 1) & ~1;
  const u32 tileCount = inCount / TILE_WIDTH + ( inCount % TILE_WIDTH != 0 );
  const u32 batchCount = ceil( (float) wordPerChunk / wordPerBatch );

  // Allocate GPU memory buffers
  Gd *d_bases, *d_exponents;

  cudaErrchk( cudaMalloc( (void**) &d_bases, inCount * sizeof(Gd) ) );
  cudaErrchk( cudaMalloc( (void**) &d_exponents, outCount * sizeof(Gd) ) );

  // Copy bases and exponents to GPU
  u16 *baseOffset = &database[inputIdxStart];
  u16 *exponents = new u16[outCount];
  for ( int i = outputIdxStart; i <= outputIdxEnd; ++i ) {
    exponents[i - outputIdxStart] = outputrows[i].exponent;
  }

  cudaErrchk( cudaMemcpyAsync( d_bases, baseOffset, inCount * sizeof(Gd), cudaMemcpyHostToDevice ) );
  cudaErrchk( cudaMemcpyAsync( d_exponents, exponents, outCount * sizeof(u16), cudaMemcpyHostToDevice ) );
  cudaErrchk( cudaDeviceSynchronize() );
  delete [] exponents;

  // Set kernel launch parameters
  dim3 dimGrid( tileCount );
  dim3 dimBlock( TBLOCK_SZ );

  cudaStream_t *stream = new cudaStream_t[batchCount];
  Gd **d_input = new Gd*[batchCount];
  Gd **d_intermediate = new Gd*[batchCount];
  Gd **d_output = new Gd*[batchCount];
  
  for ( u32 i = 0; i < batchCount; ++i ) {
    // Create stream
    cudaErrchk( cudaStreamCreateWithFlags( &stream[i], cudaStreamNonBlocking ) );
    // Allocate device memory
    int batchSz = wordPerBatch;
    if ( i == batchCount - 1 ) {
      batchSz = wordPerChunk - i * wordPerBatch;
    }
    int batchSzAligned = batchSz + (batchSz & 1);
    cudaErrchk( cudaMalloc( (void**) &d_input[i], inCount * batchSz * sizeof(Gd) ) );
    cudaErrchk( cudaMalloc( (void**) &d_intermediate[i], tileCount * batchSzAligned * outCount * sizeof(Gd) ) );
    cudaErrchk( cudaMalloc( (void**) &d_output[i], outCount * batchSzAligned * sizeof(Gd) ) );

  }

  // Invoke copies and kernels concurrently, one stream per batch
  for ( u32 batchIdx = 0; batchIdx < batchCount; ++batchIdx ) {
    int batchSz = wordPerBatch;
    if ( batchIdx == batchCount - 1 ) {
      batchSz = wordPerChunk - batchIdx * wordPerBatch;
    }
    int batchSzAligned = batchSz + (batchSz & 1);

    // Copy input data to GPU
    for ( u32 i = 0; i < inCount; ++i ) {
      void *inputBufOffset = (char*) inputBuf + i * size + batchIdx * wordPerBatch * sizeof(G);
      void *d_inputBufOffset = (char*) d_input[batchIdx] + i * batchSz * sizeof(G);
      cudaErrchk( cudaMemcpyAsync( d_inputBufOffset, inputBufOffset, batchSz * sizeof(G), cudaMemcpyHostToDevice, stream[batchIdx] ) );
    }

    // Launch Compute Kernel
    ProcessKer<<<dimGrid, dimBlock, (batchSzAligned + 1) * TILE_WIDTH * sizeof(G), stream[batchIdx]>>> 
    ( batchSz,
      d_input[batchIdx],
      d_bases,
      inCount,
      outCount,
      d_intermediate[batchIdx],
      d_exponents
    );
    // Launch Reduce Kernel
    dim3 dimBlockReduce( 32 );
    dim3 dimGridReduce( ceil( outCount / (float) dimBlockReduce.x ), batchSzAligned / 2 );
    ReduceKer<<<dimGridReduce, dimBlockReduce, 0, stream[batchIdx]>>>
    ( (u32*) d_intermediate[batchIdx],
      (u32*) d_output[batchIdx],
      outCount,
      tileCount
    );

    // Copy Result to output buffer
    for ( u32 i = 0; i < outCount; ++i ) {
      cudaErrchk( cudaMemcpyAsync( &((G*) outputBuf)[wordPerChunk * i + wordPerBatch * batchIdx],
                                   &d_output[batchIdx][batchSzAligned * i],
                                   batchSz * sizeof(GaloisCu16),
                                   cudaMemcpyDeviceToHost,
                                   stream[batchIdx] ) );
    }
    
  }
  cudaErrchk( cudaDeviceSynchronize() );
  
  for ( u32 i = 0; i < batchCount; ++i ) {
    // Destroy stream and free device memory
    cudaErrchk( cudaStreamDestroy( stream[i] ) );
    cudaFree( d_input[i] );
    cudaFree( d_intermediate[i] );
    cudaFree( d_output[i] );
  }

  // Free memory
  cudaFree( d_bases );
  cudaFree( d_exponents );
  delete[] stream;
  delete[] d_input;
  delete[] d_intermediate;
  delete[] d_output;
 
}

From the timeline one can see that cudaMemcpyAsync blocks until the kernels have finished. This is always the case for device-to-host transfers when the host memory is pageable. In general, cudaMemcpyAsync to or from host memory can only be non-blocking when the host buffer is pinned (page-locked) rather than pageable, regardless of the copy direction.
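To illustrate the difference, here is a minimal sketch (not your code; all buffer and stream names are made up):

  const size_t N = 1 << 20;
  float *d_buf, *pinned;
  float *pageable = new float[N];  // ordinary pageable allocation
  cudaStream_t stream;

  cudaMalloc( (void**) &d_buf, N * sizeof(float) );
  cudaMallocHost( (void**) &pinned, N * sizeof(float) );  // page-locked allocation
  cudaStreamCreate( &stream );

  // Pageable source: the driver must stage the data through an internal
  // buffer, so this transfer cannot reliably overlap kernel execution.
  cudaMemcpyAsync( d_buf, pageable, N * sizeof(float), cudaMemcpyHostToDevice, stream );

  // Pinned source: the copy is merely queued on the stream, and the DMA
  // engine can run it concurrently with kernels in other streams.
  cudaMemcpyAsync( d_buf, pinned, N * sizeof(float), cudaMemcpyHostToDevice, stream );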

How are const void *inputBuf and void* outputBuf allocated?

I allocated them as pinned memory (I think):

  G *input, *outputGPU;
  u16 *exp;
  cudaErrchk( cudaMallocHost( (void**) &input, inCount * chunkSz ) );
  cudaErrchk( cudaMallocHost( (void**) &outputGPU, outCount * chunkSz ) );
  cudaErrchk( cudaMallocHost( (void**) &exp, outCount * sizeof(u16) ) );
  cudaErrchk( cudaMallocHost( (void**) &database, inCount * sizeof(u16) ) );
  memset( outputGPU, 0, outCount * chunkSz );
  
  // Code that load data into input, exp, database...

  ProcessCu( chunkSz, 0, inCount - 1, input, 0, outCount - 1, outputGPU );
  
  // Free memory
  cudaFreeHost( input );
  cudaFreeHost( outputGPU );
  cudaFreeHost( exp );
  cudaFreeHost( database );
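One way to double-check that these buffers really are pinned is to query the pointer attributes (a small sketch, reusing the cudaErrchk macro from above):

  cudaPointerAttributes attr;
  cudaErrchk( cudaPointerGetAttributes( &attr, input ) );
  // cudaMemoryTypeHost means page-locked host memory;
  // cudaMemoryTypeUnregistered means a plain pageable allocation (CUDA 11+).
  if ( attr.type != cudaMemoryTypeHost ) {
    fprintf( stderr, "input is not pinned host memory\n" );
  }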

I figured out the problem. For some reason, issuing the D2H memcpys in a for loop causes them to block even though the host memory is pinned. I added an extra host staging buffer so that the H2D and D2H memcpys are each done in one go, and now the streams execute concurrently.
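For reference, here is a rough sketch of that workaround (the h_staging buffers are hypothetical names, one pinned Gd buffer per batch allocated with cudaMallocHost; the same gather-then-single-copy idea applies on the H2D side). Inside the batch loop, the outCount small strided copies are replaced by one contiguous copy per batch:

    cudaErrchk( cudaMemcpyAsync( h_staging[batchIdx],
                                 d_output[batchIdx],
                                 outCount * batchSzAligned * sizeof(Gd),
                                 cudaMemcpyDeviceToHost,
                                 stream[batchIdx] ) );

After cudaDeviceSynchronize(), the rows are scattered into the strided layout of outputBuf on the CPU:

  for ( u32 batchIdx = 0; batchIdx < batchCount; ++batchIdx ) {
    u32 batchSz = ( batchIdx == batchCount - 1 )
                  ? wordPerChunk - batchIdx * wordPerBatch : wordPerBatch;
    u32 batchSzAligned = batchSz + (batchSz & 1);
    for ( u32 i = 0; i < outCount; ++i ) {
      memcpy( &((G*) outputBuf)[wordPerChunk * i + wordPerBatch * batchIdx],
              &h_staging[batchIdx][batchSzAligned * i],
              batchSz * sizeof(G) );
    }
  }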

