CUDA kernel runs slower after switching from float to uint8_t

Hi, I’m trying to improve the runtime of my CUDA function, which does pixel-block alignment of a distorted image against a given reference image. To make cudaMemcpy run faster, I switched the data from float to uint8_t. I tested on Google Colab with a T4 GPU and the results look normal (much faster cudaMemcpy and a slight improvement in kernel runtime). When I tested locally on an NVIDIA RTX A4000, the cudaMemcpy is much faster as expected, but somehow the kernel takes longer to execute (from 3 ms to 5 ms). What could be the reason? My code is below; I tested with a 4K-resolution image.
Using float:

__global__ 
void correct_image_kernel(
    const float* d_distorted, 
    const float* d_reference, 
    float* d_corrected, 
    int* d_offset_map, 
    int h, 
    int w){
    extern __shared__ ErrorPos shared_data[];
    __shared__ float shared_distorted[(block_size+search_size)*(block_size+search_size)];

    int half_search = search_size/2;
    int thread_id = threadIdx.y*blockDim.x+threadIdx.x;
    int sharedSize = block_size + search_size;

    // Calculate global coordinates of the block
    int block_i = blockIdx.y;
    int block_j = blockIdx.x;
    int i = block_i*block_size;
    int j = block_j*block_size;

    // load reference block
    __shared__ float ref_block[block_size*block_size];
    
    for ( int loadI = threadIdx.y; loadI < sharedSize; loadI += blockDim.y ){
        for ( int loadJ = threadIdx.x; loadJ < sharedSize; loadJ += blockDim.x ){
            int globalI = loadI + blockIdx.y * block_size - half_search;
            int globalJ = loadJ + blockIdx.x * block_size - half_search;
            if ( globalI < h && globalJ < w && globalI >= 0 && globalJ >= 0 ){
                shared_distorted[ loadI * sharedSize + loadJ ] = d_distorted[ globalI * w + globalJ];
            }else{
                shared_distorted[ loadI * sharedSize + loadJ ] = 0.0f;
            }

            if ( loadI < block_size && loadJ < block_size ){
                ref_block[ loadI * block_size + loadJ ] = d_reference[ ( i + loadI ) * w + ( j + loadJ ) ];
            }
        }
    }
    __syncthreads();

    float error = 0.0f;
    for (int di = 0; di < block_size; ++di){
        for (int dj = 0; dj < block_size; ++dj){
            float diff = ref_block[di*block_size+dj]-shared_distorted[(threadIdx.y+di)*sharedSize+(threadIdx.x+dj)];
            error += diff * diff;
        }
    }
    error = error / (block_size * block_size);

    // store current positions and errors
    shared_data[thread_id].error = error;
    shared_data[thread_id].si = threadIdx.y;
    shared_data[thread_id].sj = threadIdx.x;
    __syncthreads();

    // min-error reduction; the stride rounds up so the non-power-of-two
    // thread count (10 x 10 = 100) does not drop a candidate
    for (int active = blockDim.x*blockDim.y; active > 1; ){
        int stride = (active + 1) / 2;
        if (thread_id + stride < active) {
            if (shared_data[thread_id+stride].error < shared_data[thread_id].error){
                shared_data[thread_id] = shared_data[thread_id+stride];
            }
        }
        __syncthreads();
        active = stride;
    }

    // 1st thread in the block writes the best match
    if (thread_id == 0){
        int best_di = shared_data[0].si;
        int best_dj = shared_data[0].sj;
        d_offset_map[block_i * (w / block_size) * 2 + block_j * 2] = best_di;
        d_offset_map[block_i * (w / block_size) * 2 + block_j * 2 + 1] = best_dj;

        // Copy the best fit block to the current position
        for (int di = 0; di < block_size; ++di){
            for (int dj = 0; dj < block_size; ++dj){
                d_corrected[(i + di) * w + (j + dj)] = shared_distorted[(best_di + di) * sharedSize + (best_dj + dj)];
            }
        }
    }
}

Using uint8_t:

__global__ 
void correct_image_kernel(
    const uint8_t* d_distorted, 
    const uint8_t* d_reference, 
    uint8_t* d_corrected, 
    int* d_offset_map, 
    int h, 
    int w
)
{
    extern __shared__ ErrorPos shared_data[];
    __shared__ uint8_t shared_distorted[(block_size+search_size)*(block_size+search_size)];

    int half_search = search_size/2;
    int thread_id = threadIdx.y*blockDim.x+threadIdx.x;
    int sharedSize = block_size + search_size;

    // Calculate global coordinates of the block
    int block_i = blockIdx.y;
    int block_j = blockIdx.x;
    int i = block_i*block_size;
    int j = block_j*block_size;

    // load reference block
    __shared__ uint8_t ref_block[block_size*block_size];
    
    for ( int loadI = threadIdx.y; loadI < sharedSize; loadI += blockDim.y ){
        for ( int loadJ = threadIdx.x; loadJ < sharedSize; loadJ += blockDim.x ){
            int globalI = loadI + blockIdx.y * block_size - half_search;
            int globalJ = loadJ + blockIdx.x * block_size - half_search;
            if ( globalI < h && globalJ < w && globalI >= 0 && globalJ >= 0 ){
                shared_distorted[ loadI * sharedSize + loadJ ] = d_distorted[ globalI * w + globalJ] ;
            }else{
                shared_distorted[ loadI * sharedSize + loadJ ] = 0;
            }

            if ( loadI < block_size && loadJ < block_size ){
                ref_block[ loadI * block_size + loadJ ] = d_reference[ ( i + loadI ) * w + ( j + loadJ ) ];
            }
        }
    }
    __syncthreads();

    int error = 0;
    for (int di = 0; di < block_size; ++di){
        for (int dj = 0; dj < block_size; ++dj){
            int diff = ref_block[di*block_size+dj]-shared_distorted[(threadIdx.y+di)*sharedSize+(threadIdx.x+dj)];
            error += diff * diff;
        }
    }
    float mse = static_cast<float>( error ) / (block_size * block_size);

    // store current positions and errors
    shared_data[thread_id].error = mse;
    shared_data[thread_id].si = threadIdx.y;
    shared_data[thread_id].sj = threadIdx.x;
    __syncthreads();

    // min-error reduction; the stride rounds up so the non-power-of-two
    // thread count (10 x 10 = 100) does not drop a candidate
    for (int active = blockDim.x*blockDim.y; active > 1; ){
        int stride = (active + 1) / 2;
        if (thread_id + stride < active) {
            if (shared_data[thread_id+stride].error < shared_data[thread_id].error){
                shared_data[thread_id] = shared_data[thread_id+stride];
            }
        }
        __syncthreads();
        active = stride;
    }

    // 1st thread in the block writes the best match
    if (thread_id == 0){
        int best_di = shared_data[0].si;
        int best_dj = shared_data[0].sj;
        d_offset_map[block_i * (w / block_size) * 2 + block_j * 2] = best_di;
        d_offset_map[block_i * (w / block_size) * 2 + block_j * 2 + 1] = best_dj;

        // Copy the best fit block to the current position
        for (int di = 0; di < block_size; ++di){
            for (int dj = 0; dj < block_size; ++dj){
                if ( i+di >=0 && i + di < h && j + dj >=0 && j + dj < w )
                {
                    d_corrected[(i + di) * w + (j + dj)] = shared_distorted[(best_di + di) * sharedSize + (best_dj + dj)];
                }
            }
        }
    }
}

Here are the main function and the structs I used, just in case:

#include <chrono>
#include <cstdint>
#include <cuda_runtime.h>
#include <iostream>
#include <opencv2/opencv.hpp>

using namespace cv;
using namespace std;

const int block_size = 15;
const int search_size = 10;
const string distorted_name = "../input/distorted_big.jpg";
const string reference_name = "../input/4096x4096.png";

//for error and position
struct ErrorPos{
    float error;
    int si, sj;
};

int main(){

    cudaFree(0); // no-op free that forces CUDA context initialization up front, so it is not charged to the later timings

    auto pin1 = std::chrono::high_resolution_clock::now();

    Mat distorted = imread(distorted_name, IMREAD_GRAYSCALE);
    Mat reference = imread(reference_name, IMREAD_GRAYSCALE);
    if (distorted.empty() || reference.empty() || distorted.rows != reference.rows || distorted.cols != reference.cols){
        cerr << "Could not open or find the images or 2 images have mismatched size" << endl;
        return -1;
    }
    auto temp_pin = std::chrono::high_resolution_clock::now();

    //timing
    auto pin2 = std::chrono::high_resolution_clock::now();

    int h = distorted.rows;
    int w = distorted.cols;
    cout << h << " " << w << endl;
    size_t img_size = h*w*sizeof(uint8_t);
    size_t map_size = (h/block_size)*(w/block_size)*2*sizeof(int);

    uint8_t* d_distorted;
    uint8_t* d_reference;
    uint8_t* d_corrected;
    int* d_offset_map;

    cudaMalloc(&d_distorted, img_size);
    cudaMalloc(&d_reference, img_size);
    cudaMalloc(&d_corrected, img_size);
    cudaMalloc(&d_offset_map, map_size);

    cudaMemcpy(d_distorted, distorted.data, img_size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_reference, reference.data, img_size, cudaMemcpyHostToDevice);

    // Define CUDA grid and block size
    dim3 block_dim(search_size, search_size);
    dim3 grid_dim(w/block_size, h/block_size);
    cout << "number of blocks: " << grid_dim.x << " " << grid_dim.y << endl;
    cout << "number of threads: " << block_dim.x << " " << block_dim.y << endl;

    // Timing
    auto pin3 = std::chrono::high_resolution_clock::now();

    // Define the size of dynamically shared memory
    size_t shared_dim = (search_size*search_size)*sizeof(ErrorPos);
    correct_image_kernel<<<grid_dim, block_dim, shared_dim>>>(d_distorted, d_reference, d_corrected, d_offset_map, h, w);

    cudaDeviceSynchronize();
    auto pin4 = std::chrono::high_resolution_clock::now();

    cudaMemcpy(distorted.data, d_corrected, img_size, cudaMemcpyDeviceToHost);

    auto pin5 = std::chrono::high_resolution_clock::now();
    auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(pin2 - pin1).count();
    cout << "image read time: " << duration << " milliseconds" << std::endl;
    duration = std::chrono::duration_cast<std::chrono::milliseconds>(pin3 - pin2).count();
    cout << "memcpy hostToDevice time: " << duration << " milliseconds" << std::endl;
    duration = std::chrono::duration_cast<std::chrono::milliseconds>(pin4 - pin3).count();
    cout << "kernel run time: " << duration << " milliseconds" << std::endl;
    duration = std::chrono::duration_cast<std::chrono::milliseconds>(pin5 - pin4).count();
    cout << "memcpy deviceToHost time: " << duration << " milliseconds" << std::endl;
    duration = std::chrono::duration_cast<std::chrono::milliseconds>(pin5 - pin2).count();
    cout << "total run time(excluding image loading): " << duration << " milliseconds" << std::endl;
    duration = std::chrono::duration_cast<std::chrono::milliseconds>(pin2 - temp_pin).count();
    cout << "convert time: " << duration << " milliseconds" << std::endl;

    //cv::normalize(distorted, distorted, 0, 255, cv::NORM_MINMAX);
    imwrite("corrected_2.jpg", distorted);

    // Free memory
    cudaFree(d_distorted);
    cudaFree(d_reference);
    cudaFree(d_corrected);
    cudaFree(d_offset_map);

    return 0;
}

Make sure you are looking at release builds with full optimizations. GPUs are basically optimized for 32-bit data. Access to 8-bit data will often incur additional overhead (e.g. in the form of type-conversion instructions), although an expansion of run time from 3 to 5 milliseconds is a larger difference than I would expect.

If this were my code, I would first examine the differences in the generated machine code (SASS) between the two variants. If there is nothing in the code that could plausibly explain the performance difference, I would carefully compare the profiling statistics for the two kernel variants.

For image processing, it is usually best to combine byte-sized pixels and use the uchar4 type to grab four pixels in one 32-bit access, possibly processing these using CUDA’s SIMD intrinsics for further efficiency.
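
To make the idea concrete, here is a minimal, hypothetical sketch (not taken from the code above): a kernel that accumulates the sum of squared differences of two byte images while fetching four pixels per 32-bit load. The kernel name, parameters, and the atomicAdd accumulation are made up for this example, and it assumes the pixel count is a multiple of four and the buffers are 4-byte aligned; a tuned version would reduce per block before touching the global counter.

__global__
void ssd_uchar4(
    const uchar4* __restrict__ a,
    const uchar4* __restrict__ b,
    unsigned long long* result,   // accumulated sum of squared differences
    int n4)                       // number of uchar4 elements (pixels / 4)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= n4) return;

    uchar4 pa = a[idx];   // one 32-bit load fetches four byte pixels
    uchar4 pb = b[idx];

    // widen to int before the arithmetic
    int dx = (int)pa.x - (int)pb.x;
    int dy = (int)pa.y - (int)pb.y;
    int dz = (int)pa.z - (int)pb.z;
    int dw = (int)pa.w - (int)pb.w;

    // accumulate this thread's partial sum
    atomicAdd(result, (unsigned long long)(dx*dx + dy*dy + dz*dz + dw*dw));
}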

OK, I changed the shared-memory arrays to float and used static_cast to convert uint8_t to float when populating shared memory. Now it’s as fast as expected!
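
For reference, a sketch of roughly what that change looks like, assuming the rest of the kernel stays as posted (only the affected lines are shown): the global buffers remain uint8_t so the host-to-device copies stay small, while the shared-memory working set is widened to float so the inner loop does 32-bit shared-memory loads and float math instead of per-byte accesses.

    __shared__ float shared_distorted[(block_size+search_size)*(block_size+search_size)];
    __shared__ float ref_block[block_size*block_size];

    // ... inside the load loops, convert once on the way into shared memory:
    shared_distorted[ loadI * sharedSize + loadJ ] =
        static_cast<float>( d_distorted[ globalI * w + globalJ ] );
    ref_block[ loadI * block_size + loadJ ] =
        static_cast<float>( d_reference[ ( i + loadI ) * w + ( j + loadJ ) ] );

    // ... and convert back to uint8_t only when writing the corrected block out:
    d_corrected[(i + di) * w + (j + dj)] =
        static_cast<uint8_t>( shared_distorted[(best_di + di) * sharedSize + (best_dj + dj)] );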
