Hi, I’m a newcomer to CUDA. I’m trying to accelerate a program with it, and when I pipeline the workload across multiple CUDA streams, the HtoD transfers don’t overlap with kernel executions, and I can’t figure out why.
As shown below, the HtoD memcpys only overlap with DtoH memcpys, never with kernels.
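To make the intent clear, here is first a stripped-down sketch of the per-stream pattern I’m trying to achieve (DummyKer, nBatches, and the buffer sizes here are placeholders, not my real code; error checking omitted for brevity):

#include <cuda_runtime.h>

// Placeholder kernel, standing in for the real compute work.
__global__ void DummyKer( float *data, int n )
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if ( i < n ) data[i] *= 2.0f;
}

int main()
{
    const int nBatches = 4;
    const int n = 1 << 20;                  // words per batch
    const size_t bytes = n * sizeof(float);

    // Host buffer allocated with new, the same way my real program does it
    float *h_buf = new float[nBatches * n];
    for ( int i = 0; i < nBatches * n; ++i ) h_buf[i] = 1.0f;

    float *d_buf[nBatches];
    cudaStream_t stream[nBatches];
    for ( int b = 0; b < nBatches; ++b ) {
        cudaMalloc( (void**) &d_buf[b], bytes );
        cudaStreamCreateWithFlags( &stream[b], cudaStreamNonBlocking );
    }

    // One stream per batch: HtoD copy, kernel, DtoH copy.
    // I expected the HtoD copy of batch b+1 to overlap with the kernel of batch b.
    for ( int b = 0; b < nBatches; ++b ) {
        cudaMemcpyAsync( d_buf[b], h_buf + b * n, bytes, cudaMemcpyHostToDevice, stream[b] );
        DummyKer<<< n / 256, 256, 0, stream[b] >>>( d_buf[b], n );
        cudaMemcpyAsync( h_buf + b * n, d_buf[b], bytes, cudaMemcpyDeviceToHost, stream[b] );
    }
    cudaDeviceSynchronize();

    for ( int b = 0; b < nBatches; ++b ) {
        cudaStreamDestroy( stream[b] );
        cudaFree( d_buf[b] );
    }
    delete[] h_buf;
    return 0;
}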
Below is the actual code. My GPU is an RTX 2080 Ti, the system is Arch Linux, and the CUDA version is 11.8.
#define TBLOCK_SZ 512
#define MAX_THREAD 1024
#define TILE_WIDTH 16
#define SHARED_MEM_SZ 32768
void ProcessCu( const size_t size, // size of one chunk of data
const u32 inputIdxStart,
const u32 inputIdxEnd,
const void *inputBuf,
const u32 outputIdxStart,
const u32 outputIdxEnd,
void *outputBuf )
{
// CUDA Device compatible Galois type.
typedef GaloisCu<G::Bits, G::Generator, G::ValueType> Gd;
Gd::uploadTable();
const u32 inCount = inputIdxEnd - inputIdxStart + 1;
const u32 outCount = outputIdxEnd - outputIdxStart + 1;
const u32 wordPerChunk = size / sizeof(Gd);
// Batches need to be 4-byte aligned
const u32 wordPerBatch = (TBLOCK_SZ * SHARED_MEM_SZ / ( MAX_THREAD * TILE_WIDTH * sizeof(Gd) ) - 1) & ~1;
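// With the constants above, wordPerBatch works out to
// (512 * 32768 / (1024 * 16 * 2) - 1) & ~1 = (512 - 1) & ~1 = 510
// words per batch (assuming sizeof(Gd) == 2).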
const u32 tileCount = inCount / TILE_WIDTH + ( inCount % TILE_WIDTH != 0 );
const u32 batchCount = ceil( (float) wordPerChunk / wordPerBatch );
// Allocate GPU memory buffers
Gd *d_bases, *d_exponents;
cudaErrchk( cudaMalloc( (void**) &d_bases, inCount * sizeof(Gd) ) );
cudaErrchk( cudaMalloc( (void**) &d_exponents, outCount * sizeof(Gd) ) );
// Copy bases and exponents to GPU
u16 *baseOffset = &database[inputIdxStart];
u16 *exponents = new u16[outCount];
for ( int i = outputIdxStart; i <= outputIdxEnd; ++i ) {
exponents[i - outputIdxStart] = outputrows[i].exponent;
}
cudaErrchk( cudaMemcpyAsync( d_bases, baseOffset, inCount * sizeof(Gd), cudaMemcpyHostToDevice ) );
cudaErrchk( cudaMemcpyAsync( d_exponents, exponents, outCount * sizeof(u16), cudaMemcpyHostToDevice ) );
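// Wait for the two copies above to complete before freeing the staging buffer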
cudaErrchk( cudaDeviceSynchronize() );
delete [] exponents;
// Set kernel launch parameters
dim3 dimGrid( tileCount );
dim3 dimBlock( TBLOCK_SZ );
cudaStream_t *stream = new cudaStream_t[batchCount];
Gd **d_input = new Gd*[batchCount];
Gd **d_intermediate = new Gd*[batchCount];
Gd **d_output = new Gd*[batchCount];
for ( u32 i = 0; i < batchCount; ++i ) {
// Create stream
cudaErrchk( cudaStreamCreateWithFlags( &stream[i], cudaStreamNonBlocking ) );
// Allocate device memory
int batchSz = wordPerBatch;
if ( i == batchCount - 1 ) {
batchSz = wordPerChunk - i * wordPerBatch;
}
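// Round odd batch sizes up to an even word count (4-byte alignment)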
int batchSzAligned = batchSz + (batchSz & 1);
cudaErrchk( cudaMalloc( (void**) &d_input[i], inCount * batchSz * sizeof(Gd) ) );
cudaErrchk( cudaMalloc( (void**) &d_intermediate[i], tileCount * batchSzAligned * outCount * sizeof(Gd) ) );
cudaErrchk( cudaMalloc( (void**) &d_output[i], outCount * batchSzAligned * sizeof(Gd) ) );
}
// Issue copies and kernels concurrently, one stream per batch
for ( u32 batchIdx = 0; batchIdx < batchCount; ++batchIdx ) {
int batchSz = wordPerBatch;
if ( batchIdx == batchCount - 1 ) {
batchSz = wordPerChunk - batchIdx * wordPerBatch;
}
int batchSzAligned = batchSz + (batchSz & 1);
// Copy input data to GPU
for ( u32 i = 0; i < inCount; ++i ) {
void *inputBufOffset = (char*) inputBuf + i * size + batchIdx * wordPerBatch * sizeof(G);
void *d_inputBufOffset = (char*) d_input[batchIdx] + i * batchSz * sizeof(G);
cudaErrchk( cudaMemcpyAsync( d_inputBufOffset, inputBufOffset, batchSz * sizeof(G), cudaMemcpyHostToDevice, stream[batchIdx] ) );
}
// Launch Compute Kernel
ProcessKer<<<dimGrid, dimBlock, (batchSzAligned + 1) * TILE_WIDTH * sizeof(G), stream[batchIdx]>>>
( batchSz,
d_input[batchIdx],
d_bases,
inCount,
outCount,
d_intermediate[batchIdx],
d_exponents
);
// Launch Reduce Kernel
dim3 dimBlockReduce( 32 );
dim3 dimGridReduce( ceil( outCount / (float) dimBlockReduce.x ), batchSzAligned / 2 );
ReduceKer<<<dimGridReduce, dimBlockReduce, 0, stream[batchIdx]>>>
( (u32*) d_intermediate[batchIdx],
(u32*) d_output[batchIdx],
outCount,
tileCount
);
// Copy results back to the output buffer
for ( u32 i = 0; i < outCount; ++i ) {
cudaErrchk( cudaMemcpyAsync( &((G*) outputBuf)[wordPerChunk * i + wordPerBatch * batchIdx],
&d_output[batchIdx][batchSzAligned * i],
batchSz * sizeof(GaloisCu16),
cudaMemcpyDeviceToHost,
stream[batchIdx] ) );
}
}
cudaErrchk( cudaDeviceSynchronize() );
for ( u32 i = 0; i < batchCount; ++i ) {
// Destroy stream and free device memory
cudaErrchk( cudaStreamDestroy( stream[i] ) );
cudaFree( d_input[i] );
cudaFree( d_intermediate[i] );
cudaFree( d_output[i] );
}
// Free memory
cudaFree( d_bases );
cudaFree( d_exponents );
delete[] stream;
delete[] d_input;
delete[] d_intermediate;
delete[] d_output;
}