Kernel Convolution with streams provides no benefit

Hello everyone. I'm developing a CUDA application for kernel convolution on 2D images, using 3x3 convolution kernels.
My kernel uses tiles and shared memory, and the kernel timing itself is pretty good (about 0.1 ms for a 10000x10000 image). The problem is that a lot of time is spent on cudaMalloc and on copying data from host to device and vice versa. I implemented another version of the program using streams, but it provides no benefit. My code is below. Any suggestions? Thanks for your help! FYI: my card is an NVIDIA GeForce GTX 1650 Ti.

int main() {
    cv::Mat image = cv::imread(
        R"(path_to_image)");
    if (image.empty()) {
        std::cerr << "ERROR: Image NOT found!" << std::endl;
        return -1;
    }

    cv::Mat outputImage = image.clone();
    int channels = 3;
    int imageWidth = outputImage.cols;
    int imageHeight = outputImage.rows;

    printf(
        "PLEASE CHOOSE A KERNEL TO BE USED:\n 0: Identity\n 1: Blur\n 2: Emboss\n 3: Sharpen\n 4: Outline\n 5: Bottom sobel\n 6: Ridge\n 7: Edge detection\n 8: Box Blur\n NOTE: If no valid input is provided, the IDENTITY kernel will be used!\n");
    int choosenKernel;
    std::cin >> choosenKernel;
    float *kernel = getKernel(choosenKernel);
    auto beginTime = std::chrono::high_resolution_clock::now();
    /*<--------------------------------DEVICE MEM ALLOC-------------------------------->*/
    cudaSetDevice(0);
    size_t totalSize = (size_t) imageWidth * imageHeight * channels * sizeof(unsigned char); // size_t arithmetic avoids int overflow on very large images
    unsigned char *d_imageDatas, *d_outputImageDatas;
    CUDA_CHECK(cudaMallocAsync(&d_imageDatas, totalSize, 0));
    CUDA_CHECK(cudaMallocAsync(&d_outputImageDatas, totalSize, 0));

    /*<--------------------------------HOST MEM ALLOC-------------------------------->*/
    unsigned char *imageDatas = (unsigned char *) malloc(totalSize);
    unsigned char *outputImageDatas = (unsigned char *) malloc(totalSize);

    assignDatas(imageDatas, imageWidth, imageHeight, channels, &outputImage);

    /*<--------------------------------DEVICE CONSTANT MEM SET-------------------------------->*/
    CUDA_CHECK(cudaMemcpyToSymbol(KERNEL, kernel, MASK_WIDTH * MASK_HEIGHT * sizeof(float)));
    CUDA_CHECK(cudaMemcpyToSymbol(CHANNELS, &channels, sizeof(int)));
    CUDA_CHECK(cudaMemcpyToSymbol(IMG_WIDTH, &imageWidth, sizeof(int)));
    CUDA_CHECK(cudaMemcpyToSymbol(IMG_HEIGHT, &imageHeight, sizeof(int)));

    /*<--------------------------------MULTI-STREAM SETUP-------------------------------->*/
    const int streamsNumber = 4;
    cudaStream_t streams[streamsNumber];
    for (int i = 0; i < streamsNumber; ++i) {
        CUDA_CHECK(cudaStreamCreate(&streams[i]));
    }

    int rowsPerStream = imageHeight / streamsNumber;
    int overlap = MASK_RADIUS_Y;

    dim3 blockDim(TILE_WIDTH, TILE_WIDTH);

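    /* Each stream handles one horizontal band of the image: the H2D copy
       includes `overlap` halo rows so the convolution has valid neighbor rows
       at band boundaries, while the D2H copy brings back only the band's own
       rowsPerStream rows. */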
    for (int i = 0; i < streamsNumber; ++i) {
        int startRow = i * rowsPerStream - (i > 0 ? overlap : 0);
        int numRows = rowsPerStream + (i > 0 ? overlap : 0) + overlap + 1;

        if (startRow + numRows > imageHeight) {
            numRows = imageHeight - startRow;
        }

        unsigned char *h_chunkSrc = imageDatas + startRow * imageWidth * channels;
        unsigned char *h_chunkDst = outputImageDatas + (i * rowsPerStream) * imageWidth * channels;
        unsigned char *d_chunkSrc = d_imageDatas + startRow * imageWidth * channels;
        unsigned char *d_chunkDst = d_outputImageDatas + (i * rowsPerStream) * imageWidth * channels;

        size_t chunkBytes = (size_t) numRows * imageWidth * channels;

        /* Asynchronous memory transfer (Host to Device) */
        CUDA_CHECK(cudaMemcpyAsync(d_chunkSrc, h_chunkSrc, chunkBytes, cudaMemcpyHostToDevice, streams[i]));

        /* Kernel launch */
        dim3 gridDim((imageWidth + TILE_WIDTH - 1) / TILE_WIDTH, (numRows + TILE_WIDTH - 1) / TILE_WIDTH);
        kernelConvolution<<<gridDim, blockDim, 0, streams[i]>>>(d_chunkSrc, d_chunkDst, startRow, numRows);

        /* Asynchronous memory transfer (Device to Host) */
        CUDA_CHECK(cudaMemcpyAsync(h_chunkDst, d_chunkDst, rowsPerStream * imageWidth * channels, cudaMemcpyDeviceToHost, streams[i]));
    }
    /* Synchronize all streams BEFORE stopping the timer: the copies and
       kernel launches above are asynchronous, so measuring without
       synchronizing first would exclude most of the actual work */
    for (int i = 0; i < streamsNumber; ++i) {
        CUDA_CHECK(cudaStreamSynchronize(streams[i]));
        CUDA_CHECK(cudaStreamDestroy(streams[i]));
    }
    auto endTime = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double, std::milli> ms_double = endTime - beginTime;

    printf("Duration ms: %f\n", ms_double.count());

    /*<------------------------------FINAL IMAGE REBUILD------------------------------>*/
    buildImage(outputImageDatas, imageWidth, imageHeight, channels, &outputImage);

    /* Memory freeing */
    free(imageDatas);
    free(outputImageDatas);
    CUDA_CHECK(cudaFree(d_imageDatas));
    CUDA_CHECK(cudaFree(d_outputImageDatas));


    cv::imshow("Original Image", image);
    cv::imshow("Output Image", outputImage);
    cv::waitKey(0);

    return 0;
}
unsigned char *imageDatas = (unsigned char *) malloc(totalSize);
unsigned char *outputImageDatas = (unsigned char *) malloc(totalSize);

Allocating those arrays with cudaMallocHost will place them in pinned memory, which should improve the transfer rate between host and device. But more importantly, with pinned memory cudaMemcpyAsync will actually be non-blocking.
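
A minimal sketch of that change, reusing the totalSize and CUDA_CHECK from your code:

unsigned char *imageDatas, *outputImageDatas;
CUDA_CHECK(cudaMallocHost(&imageDatas, totalSize));       // page-locked (pinned) host memory
CUDA_CHECK(cudaMallocHost(&outputImageDatas, totalSize)); // cudaMemcpyAsync can now overlap with other work
/* ... transfers and kernel launches ... */
CUDA_CHECK(cudaFreeHost(imageDatas));   // pinned allocations must be freed with cudaFreeHost, not free()
CUDA_CHECK(cudaFreeHost(outputImageDatas));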

Hi striker159, thanks for your reply. I also tried a pinned-memory approach, but the timing didn't change much. A 20000x20000 image takes around 800 ms in total. Of those 800 ms, only 0.2 ms is kernel processing, and around 500 ms are taken by cudaMallocHost. My cudaMallocHost version is below. I'm wondering whether that is normal and, if so, why it takes so long. Thanks!

int main() {
    cv::Mat image = cv::imread(
        R"(path)");
    if (image.empty()) {
        std::cerr << "ERROR: Image NOT found!" << std::endl;
        return -1;
    }

    cv::Mat outputImage = image.clone();
    int channels = 3;
    int imageWidth = outputImage.cols;
    int imageHeight = outputImage.rows;

    printf(
        "PLEASE CHOOSE A KERNEL TO BE USED:\n 0: Identity\n 1: Blur\n 2: Emboss\n 3: Sharpen\n 4: Outline\n 5: Bottom sobel\n 6: Ridge\n 7: Edge detection\n 8: Box Blur\n NOTE: If no valid input is provided, the IDENTITY kernel will be used!\n");
    int choosenKernel;
    std::cin >> choosenKernel;
    float *kernel = getKernel(choosenKernel);
    /*<--------------------------------DEVICE MEM ALLOC-------------------------------->*/    
    auto beginTOTAL = std::chrono::high_resolution_clock::now();
    cudaSetDevice(0);
    size_t totalSize = (size_t) imageWidth * imageHeight * channels * sizeof(unsigned char); // size_t arithmetic avoids int overflow on very large images
    unsigned char *d_imageDatas, *d_outputImageDatas;

    auto beginMalloc = std::chrono::high_resolution_clock::now();
    CUDA_CHECK(cudaMalloc(&d_imageDatas, totalSize));
    CUDA_CHECK(cudaMalloc(&d_outputImageDatas, totalSize));
    auto endMalloc = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double, std::milli> ms_double_malloc = endMalloc - beginMalloc;


    /*<--------------------------------HOST MEM ALLOC (PINNED MEMORY)-------------------------------->*/   
    unsigned char *imageDatas, *outputImageDatas;

    auto beginMallocHOST = std::chrono::high_resolution_clock::now();

    CUDA_CHECK(cudaMallocHost(&imageDatas, totalSize)); // Pinned memory allocation
    CUDA_CHECK(cudaMallocHost(&outputImageDatas, totalSize)); // Pinned memory allocation

    auto endMallocHOST = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double, std::milli> ms_double_mallocHost = endMallocHOST - beginMallocHOST;

    assignDatas(imageDatas, imageWidth, imageHeight, channels, &outputImage);
    /*<--------------------------------DEVICE CONSTANT MEM SET-------------------------------->*/  
    CUDA_CHECK(cudaMemcpyToSymbol(KERNEL, kernel, MASK_WIDTH * MASK_HEIGHT * sizeof(float)));
    CUDA_CHECK(cudaMemcpyToSymbol(CHANNELS, &channels, sizeof(int)));
    CUDA_CHECK(cudaMemcpyToSymbol(IMG_WIDTH, &imageWidth, sizeof(int)));
    CUDA_CHECK(cudaMemcpyToSymbol(IMG_HEIGHT, &imageHeight, sizeof(int)));

    /*<--------------------------------MULTI-STREAM SETUP-------------------------------->*/
    const int streamsNumber = 4;
    cudaStream_t streams[streamsNumber];
    for (int i = 0; i < streamsNumber; ++i) {
        CUDA_CHECK(cudaStreamCreate(&streams[i]));
    }

    int rowsPerStream = imageHeight / streamsNumber;
    int overlap = MASK_RADIUS_Y;

    dim3 blockDim(TILE_WIDTH, TILE_WIDTH);

    for (int i = 0; i < streamsNumber; ++i) {                                        
        int startRow = i * rowsPerStream - (i > 0 ? overlap : 0);
        int numRows = rowsPerStream + (i > 0 ? overlap : 0) + overlap + 1;

        if (startRow + numRows > imageHeight) {
            numRows = imageHeight - startRow;
        }

        unsigned char *h_chunkSrc = imageDatas + startRow * imageWidth * channels;
        unsigned char *h_chunkDst = outputImageDatas + (i * rowsPerStream) * imageWidth * channels;
        unsigned char *d_chunkSrc = d_imageDatas + startRow * imageWidth * channels;
        unsigned char *d_chunkDst = d_outputImageDatas + (i * rowsPerStream) * imageWidth * channels;

        size_t chunkBytes = (size_t) numRows * imageWidth * channels;

        /* Asynchronous memory transfer (Host to Device) */
        CUDA_CHECK(cudaMemcpyAsync(d_chunkSrc, h_chunkSrc, chunkBytes, cudaMemcpyHostToDevice, streams[i]));

        /* Kernel launch */
        dim3 gridDim((imageWidth + TILE_WIDTH - 1) / TILE_WIDTH, (numRows + TILE_WIDTH - 1) / TILE_WIDTH);

        kernelConvolution<<<gridDim, blockDim, 0, streams[i]>>>(d_chunkSrc, d_chunkDst, startRow, numRows);

        /* Asynchronous memory transfer (Device to Host) */
        CUDA_CHECK(
            cudaMemcpyAsync(h_chunkDst, d_chunkDst, rowsPerStream * imageWidth * channels, cudaMemcpyDeviceToHost,
                streams[i]));
    }
....rest of the code....

Profilers are excellent tools to answer such questions. Have you tried Nsight Compute? You will likely notice a fairly steep learning curve. In my experience, that is normal for profilers, but some of the skills acquired will be applicable to all tools of this nature and therefore portable (I cut my teeth on early versions of Borland’s Turbo Profiler and Intel’s VTune in the 1990s).

Have you calculated the theoretical optimal transfer time over PCIe for your image sizes?
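
As a rough back-of-the-envelope (assuming the 1650 Ti sits on a PCIe 3.0 x16 link with a practical throughput of roughly 12 GB/s): a 20000x20000x3 image is about 1.2 GB, so the copies alone cost on the order of 100 ms host-to-device plus 100 ms device-to-host. That puts the transfer floor around 200 ms; time beyond that is allocation and staging overhead rather than the copies themselves.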

Ways to improve speed:

First, distinguish between latency (the time from starting to copy the image until the result has been copied back) and bandwidth (relevant when you process multiple frames). You can overlap copying and computation, which improves bandwidth but not latency (see the sketch at the end of this post).

Is it possible to receive the image in device memory, or to keep using it in device memory? E.g., if you want to display it, you can directly display it as an OpenGL texture without copying back to host RAM.

Perhaps you can compress the image to shorten the time for copying.

Use multiple GPUs and compute parts of the image on each GPU.

Use a GPU and system with PCIe 5.0 (there are only a few such models) for the highest speed. Make sure all 16 lanes are used.
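
To illustrate the overlap point from above: with multiple frames you can pipeline uploads, kernels, and downloads over two streams. A rough sketch, where numFrames, frameBytes, the h_in/h_out/d_in/d_out buffers, and the processFrame kernel are placeholders (h_in/h_out must be pinned for the copies to overlap):

cudaStream_t s[2];
for (int i = 0; i < 2; ++i) CUDA_CHECK(cudaStreamCreate(&s[i]));
for (int f = 0; f < numFrames; ++f) {
    int b = f % 2; // alternate between two device buffers / streams
    /* Upload frame f while the previous frame is still being processed */
    CUDA_CHECK(cudaMemcpyAsync(d_in[b], h_in[f], frameBytes,
                               cudaMemcpyHostToDevice, s[b]));
    processFrame<<<gridDim, blockDim, 0, s[b]>>>(d_in[b], d_out[b]);
    CUDA_CHECK(cudaMemcpyAsync(h_out[f], d_out[b], frameBytes,
                               cudaMemcpyDeviceToHost, s[b]));
}
for (int i = 0; i < 2; ++i) CUDA_CHECK(cudaStreamSynchronize(s[i]));

This improves throughput (frames per second) because transfers and kernels of different frames overlap, while the latency of any individual frame stays the same.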