Hello everyone. I’m developing a CUDA application for kernel convolution in 2D images. I’m actually using 3x3 kernels for my convolution.
In my kernel i make use of tiles and shared memory and the timing for the kernel itself is pretty good (like 0.1ms for an 10000x10000 image). The problem is that a lot of time is spent on CUDA malloc, copying datas from host to device and vice-versa. I tried to implement another version of my program with streams, but it doesn’t provide any benefit. Below my code. Any suggestion? Thanks for your help! FYI: My card is NVidia GeForce 1650 TI
int main() {
cv::Mat image = cv::imread(
R"(path_to_image)");
if (image.empty()) {
std::cerr << "ERROR: Image NOT found!" << std::endl;
return -1;
}
cv::Mat outputImage = image.clone();
int channels = 3;
int imageWidth = outputImage.cols;
int imageHeight = outputImage.rows;
printf(
"PLEASE CHOOSE A KERNEL TO BE USED:\n 0: Identity\n 1: Blur\n 2: Emboss\n 3: Sharpen\n 4: Outline\n 5: Bottom sobel\n 6: Ridge\n 7: Edge detection\n 8: Box Blur\n NOTE: If no valid input is provided, the IDENTITY kernel will be used!\n");
int choosenKernel;
std::cin >> choosenKernel;
float *kernel = getKernel(choosenKernel);
auto beginTime = std::chrono::high_resolution_clock::now();
/*<--------------------------------DEVICE MEM ALLOC-------------------------------->*/
cudaSetDevice(0);
size_t totalSize = imageWidth * imageHeight * channels * sizeof(unsigned char);
unsigned char *d_imageDatas, *d_outputImageDatas;
CUDA_CHECK(cudaMallocAsync(&d_imageDatas, totalSize, 0));
CUDA_CHECK(cudaMallocAsync(&d_outputImageDatas, totalSize, 0));
/*<--------------------------------HOST MEM ALLOC-------------------------------->*/
unsigned char *imageDatas = (unsigned char *) malloc(totalSize);
unsigned char *outputImageDatas = (unsigned char *) malloc(totalSize);
assignDatas(imageDatas, imageWidth, imageHeight, channels, &outputImage);
/*<--------------------------------DEVICE CONSTANT MEM SET-------------------------------->*/
CUDA_CHECK(cudaMemcpyToSymbol(KERNEL, kernel, MASK_WIDTH * MASK_HEIGHT * sizeof(float)));
CUDA_CHECK(cudaMemcpyToSymbol(CHANNELS, &channels, sizeof(int)));
CUDA_CHECK(cudaMemcpyToSymbol(IMG_WIDTH, &imageWidth, sizeof(int)));
CUDA_CHECK(cudaMemcpyToSymbol(IMG_HEIGHT, &imageHeight, sizeof(int)));
/*<--------------------------------MULTI-STREAM SETUP-------------------------------->*/
const int streamsNumber = 4;
cudaStream_t streams[streamsNumber];
for (int i = 0; i < streamsNumber; ++i) {
CUDA_CHECK(cudaStreamCreate(&streams[i]));
}
int rowsPerStream = imageHeight / streamsNumber;
int overlap = MASK_RADIUS_Y;
dim3 blockDim(TILE_WIDTH, TILE_WIDTH);
for (int i = 0; i < streamsNumber; ++i) {
int startRow = i * rowsPerStream - (i > 0 ? overlap : 0);
int numRows = rowsPerStream + (i > 0 ? overlap : 0) + overlap + 1;
if (startRow + numRows > imageHeight) {
numRows = imageHeight - startRow;
}
unsigned char *h_chunkSrc = imageDatas + startRow * imageWidth * channels;
unsigned char *h_chunkDst = outputImageDatas + (i * rowsPerStream) * imageWidth * channels;
unsigned char *d_chunkSrc = d_imageDatas + startRow * imageWidth * channels;
unsigned char *d_chunkDst = d_outputImageDatas + (i * rowsPerStream) * imageWidth * channels;
size_t chunkBytes = numRows * imageWidth * channels;
/* Asynchronous memory transfer (Host to Device) */
CUDA_CHECK(cudaMemcpyAsync(d_chunkSrc, h_chunkSrc, chunkBytes, cudaMemcpyHostToDevice, streams[i]));
/* Kernel launch */
dim3 gridDim((imageWidth + TILE_WIDTH - 1) / TILE_WIDTH, (numRows + TILE_WIDTH - 1) / TILE_WIDTH);
kernelConvolution<<<gridDim, blockDim, 0, streams[i]>>>(d_chunkSrc, d_chunkDst, startRow, numRows);
/* Asynchronous memory transfer (Device to Host) */
CUDA_CHECK(cudaMemcpyAsync(h_chunkDst, d_chunkDst, rowsPerStream * imageWidth * channels, cudaMemcpyDeviceToHost, streams[i]));
}
auto endTime = std::chrono::high_resolution_clock::now();
std::chrono::duration<double, std::milli> ms_double = endTime - beginTime;
/* Synchronize all streams */
for (int i = 0; i < streamsNumber; ++i) {
CUDA_CHECK(cudaStreamSynchronize(streams[i]));
CUDA_CHECK(cudaStreamDestroy(streams[i]));
}
printf("Duration ms: %f\n", ms_double.count());
/*<------------------------------FINAL IMAGE REBUILD------------------------------>*/
buildImage(outputImageDatas, imageWidth, imageHeight, channels, &outputImage);
/* Memory freeing */
free(imageDatas);
free(outputImageDatas);
CUDA_CHECK(cudaFree(d_imageDatas));
CUDA_CHECK(cudaFree(d_outputImageDatas));
cv::imshow("Original Image", image);
cv::imshow("Output Image", outputImage);
cv::waitKey(0);
return 0;
}