Hi, I tried to use cudaMemcpy2DAsync for HostToDevice memcpy, but it made my program a lot slower somehow. I’m already using cudaMemcpy2DAsync for DeviceToHost memcpy to get the result back, and it runs just as fast as normal cudaMemcpy. But when I add the async HostToDevice part at the beginning, the whole program is a lot slower.
Basically I’m reading 2 large wide images (40960x4096 for example), and I’m “cropping” them horizontally into smaller tiles so that every time I read and process one portion (4096x4096) of the images. The “overlap” loads some extra data on the right side of each tile so the edge cases in the kernels are handled correctly. The pixelMap has 2 values for each pixel position, which is why its widths and offsets are multiplied by 2.
I tried to time HostToDevice cudaMemcpy2DAsync and cudaMemcpy alone, and it’s like 20ms slower. Did I do something wrong? below is my cudaMemcpy code and the Async code:
// Use a compile-time constant for the stream count: a stack array sized by a
// runtime `int` is a variable-length array, which is a non-standard extension
// in C++ (GCC/Clang accept it, MSVC does not).
constexpr int numStreams = 3;
cudaStream_t streams[numStreams];
for ( int i = 0; i < numStreams; ++i )
{
    cudaStreamCreate( &streams[i] );
}
int curStream = 0;

// Baseline: plain cudaMemcpy is a blocking call, and these are single large
// contiguous transfers — the fastest shape for a H2D copy.
auto pin1 = std::chrono::high_resolution_clock::now();
cudaMemcpy( d_image1, image1.data, imageSize * sizeof( uint8_t ), cudaMemcpyHostToDevice );
cudaMemcpy( d_image2, image2.data, imageSize * sizeof( uint8_t ), cudaMemcpyHostToDevice );
cudaMemcpy( d_pixelMap, pixelMap, mapSize * sizeof( int ), cudaMemcpyHostToDevice);
cudaDeviceSynchronize();
auto pin2 = std::chrono::high_resolution_clock::now();

// Process the image in vertical tiles of `tileWidth` columns; each tile's
// kernels and its D2H result copies go to one stream, rotating round-robin
// so tiles can overlap on the GPU.
for(int startX = 0; startX < imageWidth; startX += tileWidth )
{
    int endX = min( startX + tileWidth, imageWidth );  // clamp last tile to the image edge
    Kernel1<<< ..., streams[curStream] >>>( ..., startX, endX );
    Kernel2<<< ..., streams[curStream] >>>( ..., startX, endX );
    Kernel3<<< ..., streams[curStream] >>>( ..., startX, endX );
    ...;
    // Copy back only this tile's columns: widthInBytes bytes per row,
    // imageHeight rows, with the full image width as both pitches.
    // NOTE(review): for these copies to be truly asynchronous, resultImage1/2
    // must be in pinned host memory (cudaMallocHost / cudaHostRegister) —
    // TODO confirm how they are allocated.
    size_t widthInBytes = (endX - startX) * sizeof(uint8_t);
    cudaMemcpy2DAsync(resultImage1.data + startX, imageWidth * sizeof(uint8_t),
        d_result1 + startX, imageWidth * sizeof(uint8_t),
        widthInBytes, imageHeight,
        cudaMemcpyDeviceToHost, streams[curStream]
    );
    cudaMemcpy2DAsync(resultImage2.data + startX, imageWidth * sizeof(uint8_t),
        d_result2 + startX, imageWidth * sizeof(uint8_t),
        widthInBytes, imageHeight,
        cudaMemcpyDeviceToHost, streams[curStream]
    );
    curStream = ( curStream + 1 ) % numStreams;
}

// Drain all streams before touching the results on the host.
for( int i = 0; i < numStreams; ++i)
{
    cudaStreamSynchronize(streams[i]);
}
Async HostToDevice:
// NOTE(review): cudaMemcpy2DAsync is only truly asynchronous (and fast) when
// the host buffers (image1.data, image2.data, pixelMap, resultImage*.data)
// are pinned via cudaMallocHost / cudaHostAlloc / cudaHostRegister. With
// pageable host memory the driver stages a strided 2D copy essentially row by
// row (here: thousands of ~4 KB rows), which is far slower than one large
// contiguous cudaMemcpy — the most likely cause of the observed ~20 ms gap.
// TODO confirm how the host buffers are allocated.
cudaEvent_t prevH2DDone = nullptr;  // completion marker for the previous tile's H2D copies
for(int startX = 0; startX < imageWidth; startX += tileWidth )
{
    int endX = min( startX + tileWidth, imageWidth );
    // Copy only the columns not already transferred: the previous tile
    // already brought in [startX, startX + overlap), so normally start there
    // and copy tileWidth columns; clamp the width at the right image edge.
    int memcpyWidth = endX + overlap > imageWidth ? imageWidth - (startX + overlap) : tileWidth;
    int memcpyStart = startX + overlap;
    // First tile: nothing has been copied yet, so start at column 0 and
    // widen by `overlap` to cover the whole [0, tileWidth + overlap) range.
    if(startX == 0)
    {
        memcpyWidth += overlap;
        memcpyStart = 0;
    }

    // BUG FIX: the kernels below read the overlap columns
    // [startX, startX + overlap), which were copied by the PREVIOUS iteration
    // on a DIFFERENT stream. Streams are independent, so without an explicit
    // dependency this is a cross-stream race: the kernels may run before that
    // copy has finished. Make this stream wait on the previous tile's H2D
    // copies before enqueueing anything that consumes them.
    if( prevH2DDone != nullptr )
    {
        cudaStreamWaitEvent( streams[curStream], prevH2DDone, 0 );
    }

    cudaMemcpy2DAsync( d_image1 + memcpyStart, imageWidth * sizeof(uint8_t),
        image1.data + memcpyStart, imageWidth * sizeof(uint8_t),
        memcpyWidth * sizeof(uint8_t), imageHeight,
        cudaMemcpyHostToDevice, streams[curStream] );
    cudaMemcpy2DAsync( d_image2 + memcpyStart, imageWidth * sizeof(uint8_t),
        image2.data + memcpyStart, imageWidth * sizeof(uint8_t),
        memcpyWidth * sizeof(uint8_t), imageHeight,
        cudaMemcpyHostToDevice, streams[curStream] );
    // pixelMap holds 2 ints per pixel, hence all offsets/widths scaled by 2.
    cudaMemcpy2DAsync( d_pixelMap + memcpyStart * 2, imageWidth * 2 * sizeof(int),
        pixelMap + memcpyStart * 2, imageWidth * 2 * sizeof(int),
        memcpyWidth * 2 * sizeof( int ), imageHeight,
        cudaMemcpyHostToDevice, streams[curStream] );

    // Record completion of THIS tile's H2D copies so the next tile (on the
    // next stream) can wait on it before launching its kernels.
    cudaEvent_t h2dDone;
    cudaEventCreateWithFlags( &h2dDone, cudaEventDisableTiming );
    cudaEventRecord( h2dDone, streams[curStream] );
    if( prevH2DDone != nullptr )
    {
        // Safe even if still pending: destruction is deferred until the event completes.
        cudaEventDestroy( prevH2DDone );
    }
    prevH2DDone = h2dDone;

    Kernel1<<< ..., streams[curStream] >>>( ..., startX, endX );
    Kernel2<<< ..., streams[curStream] >>>( ..., startX, endX );
    Kernel3<<< ..., streams[curStream] >>>( ..., startX, endX );
    ...;
    // Copy this tile's result columns back; pitches are the full image width.
    size_t widthInBytes = (endX - startX) * sizeof(uint8_t);
    cudaMemcpy2DAsync(resultImage1.data + startX, imageWidth * sizeof(uint8_t),
        d_result1 + startX, imageWidth * sizeof(uint8_t),
        widthInBytes, imageHeight,
        cudaMemcpyDeviceToHost, streams[curStream]
    );
    cudaMemcpy2DAsync(resultImage2.data + startX, imageWidth * sizeof(uint8_t),
        d_result2 + startX, imageWidth * sizeof(uint8_t),
        widthInBytes, imageHeight,
        cudaMemcpyDeviceToHost, streams[curStream]
    );
    curStream = ( curStream + 1 ) % numStreams;
}
if( prevH2DDone != nullptr )
{
    cudaEventDestroy( prevH2DDone );
}
// Drain all streams before the host reads the results.
for( int i = 0; i < numStreams; ++i)
{
    cudaStreamSynchronize(streams[i]);
}