CUDA kernel runs slower after switching from float to uint8_t

Hi, I’m trying to improve the runtime of my CUDA function, which does pixel-block alignment of a distorted image against a given reference image. To make cudaMemcpy run faster, I switched the data from float to uint8_t. I tested on Google Colab with a T4 GPU and the results look normal (much faster cudaMemcpy and a slight improvement in kernel runtime). When I tested locally on an NVIDIA RTX A4000, the cudaMemcpy is much faster as expected, but somehow the kernel takes longer to execute (from 3 ms to 5 ms). What could be the reason? My code is below; I tested with a 4K-resolution image.
Using float:

__global__ 
void correct_image_kernel(
    const float* d_distorted, 
    const float* d_reference, 
    float* d_corrected, 
    int* d_offset_map, 
    int h, 
    int w){
    extern __shared__ ErrorPos shared_data[];
    __shared__ float shared_distorted[(block_size+search_size)*(block_size+search_size)];

    int half_search = search_size/2;
    int thread_id = threadIdx.y*blockDim.x+threadIdx.x;
    int sharedSize = block_size + search_size;

    // Calculate global coordinates of the block
    int block_i = blockIdx.y;
    int block_j = blockIdx.x;
    int i = block_i*block_size;
    int j = block_j*block_size;

    // load reference block
    __shared__ float ref_block[block_size*block_size];
    
    for ( int loadI = threadIdx.y; loadI < sharedSize; loadI += blockDim.y ){
        for ( int loadJ = threadIdx.x; loadJ < sharedSize; loadJ += blockDim.x ){
            int globalI = loadI + blockIdx.y * block_size - half_search;
            int globalJ = loadJ + blockIdx.x * block_size - half_search;
            if ( globalI < h && globalJ < w && globalI >= 0 && globalJ >= 0 ){
                shared_distorted[ loadI * sharedSize + loadJ ] = d_distorted[ globalI * w + globalJ];
            }else{
                shared_distorted[ loadI * sharedSize + loadJ ] = 0.0f;
            }

            if ( loadI < block_size && loadJ < block_size ){
                ref_block[ loadI * block_size + loadJ ] = d_reference[ ( i + loadI ) * w + ( j + loadJ ) ];
            }
        }
    }
    __syncthreads();

    float error = 0.0f;
    for (int di = 0; di < block_size; ++di){
        for (int dj = 0; dj < block_size; ++dj){
            float diff = ref_block[di*block_size+dj]-shared_distorted[(threadIdx.y+di)*sharedSize+(threadIdx.x+dj)];
            error += diff * diff;
        }
    }
    error = error / (block_size * block_size);

    // store current positions and errors
    shared_data[thread_id].error = error;
    shared_data[thread_id].si = threadIdx.y;
    shared_data[thread_id].sj = threadIdx.x;
    __syncthreads();

    // min-error reduction; the stride rounds up so the non-power-of-two
    // thread count (10 x 10 = 100) does not drop a candidate
    for (int active = blockDim.x*blockDim.y; active > 1; ){
        int stride = (active + 1) / 2;
        if (thread_id + stride < active) {
            if (shared_data[thread_id+stride].error < shared_data[thread_id].error){
                shared_data[thread_id] = shared_data[thread_id+stride];
            }
        }
        __syncthreads();
        active = stride;
    }

    // 1st thread in the block writes the best match
    if (thread_id == 0){
        int best_di = shared_data[0].si;
        int best_dj = shared_data[0].sj;
        d_offset_map[block_i * (w / block_size) * 2 + block_j * 2] = best_di;
        d_offset_map[block_i * (w / block_size) * 2 + block_j * 2 + 1] = best_dj;

        // Copy the best fit block to the current position
        for (int di = 0; di < block_size; ++di){
            for (int dj = 0; dj < block_size; ++dj){
                d_corrected[(i + di) * w + (j + dj)] = shared_distorted[(best_di + di) * sharedSize + (best_dj + dj)];
            }
        }
    }
}

Using uint8_t:

__global__ 
void correct_image_kernel(
    const uint8_t* d_distorted, 
    const uint8_t* d_reference, 
    uint8_t* d_corrected, 
    int* d_offset_map, 
    int h, 
    int w
)
{
    extern __shared__ ErrorPos shared_data[];
    __shared__ uint8_t shared_distorted[(block_size+search_size)*(block_size+search_size)];

    int half_search = search_size/2;
    int thread_id = threadIdx.y*blockDim.x+threadIdx.x;
    int sharedSize = block_size + search_size;

    // Calculate global coordinates of the block
    int block_i = blockIdx.y;
    int block_j = blockIdx.x;
    int i = block_i*block_size;
    int j = block_j*block_size;

    // load reference block
    __shared__ uint8_t ref_block[block_size*block_size];
    
    for ( int loadI = threadIdx.y; loadI < sharedSize; loadI += blockDim.y ){
        for ( int loadJ = threadIdx.x; loadJ < sharedSize; loadJ += blockDim.x ){
            int globalI = loadI + blockIdx.y * block_size - half_search;
            int globalJ = loadJ + blockIdx.x * block_size - half_search;
            if ( globalI < h && globalJ < w && globalI >= 0 && globalJ >= 0 ){
                shared_distorted[ loadI * sharedSize + loadJ ] = d_distorted[ globalI * w + globalJ] ;
            }else{
                shared_distorted[ loadI * sharedSize + loadJ ] = 0;
            }

            if ( loadI < block_size && loadJ < block_size ){
                ref_block[ loadI * block_size + loadJ ] = d_reference[ ( i + loadI ) * w + ( j + loadJ ) ];
            }
        }
    }
    __syncthreads();

    int error = 0;
    for (int di = 0; di < block_size; ++di){
        for (int dj = 0; dj < block_size; ++dj){
            int diff = ref_block[di*block_size+dj]-shared_distorted[(threadIdx.y+di)*sharedSize+(threadIdx.x+dj)];
            error += diff * diff;
        }
    }
    float mse = static_cast<float>( error ) / (block_size * block_size);

    // store current positions and errors
    shared_data[thread_id].error = mse;
    shared_data[thread_id].si = threadIdx.y;
    shared_data[thread_id].sj = threadIdx.x;
    __syncthreads();

    // min-error reduction; the stride rounds up so the non-power-of-two
    // thread count (10 x 10 = 100) does not drop a candidate
    for (int active = blockDim.x*blockDim.y; active > 1; ){
        int stride = (active + 1) / 2;
        if (thread_id + stride < active) {
            if (shared_data[thread_id+stride].error < shared_data[thread_id].error){
                shared_data[thread_id] = shared_data[thread_id+stride];
            }
        }
        __syncthreads();
        active = stride;
    }

    // 1st thread in the block writes the best match
    if (thread_id == 0){
        int best_di = shared_data[0].si;
        int best_dj = shared_data[0].sj;
        d_offset_map[block_i * (w / block_size) * 2 + block_j * 2] = best_di;
        d_offset_map[block_i * (w / block_size) * 2 + block_j * 2 + 1] = best_dj;

        // Copy the best fit block to the current position
        for (int di = 0; di < block_size; ++di){
            for (int dj = 0; dj < block_size; ++dj){
                if ( i+di >=0 && i + di < h && j + dj >=0 && j + dj < w )
                {
                    d_corrected[(i + di) * w + (j + dj)] = shared_distorted[(best_di + di) * sharedSize + (best_dj + dj)];
                }
            }
        }
    }
}

Here are the main function and the structs I used, just in case:

#include <chrono>
#include <cstdint>
#include <cuda_runtime.h>
#include <iostream>
#include <opencv2/opencv.hpp>

using namespace cv;
using namespace std;

const int block_size = 15;
const int search_size = 10;
const string distorted_name = "../input/distorted_big.jpg";
const string reference_name = "../input/4096x4096.png";

//for error and position
struct ErrorPos{
    float error;
    int si, sj;
};

int main(){

    cudaFree(0); // no-op free that forces CUDA context initialization up front, so it is not charged to the later timings

    auto pin1 = std::chrono::high_resolution_clock::now();

    Mat distorted = imread(distorted_name, IMREAD_GRAYSCALE);
    Mat reference = imread(reference_name, IMREAD_GRAYSCALE);
    if (distorted.empty() || reference.empty() || distorted.rows != reference.rows || distorted.cols != reference.cols){
        cerr << "Could not open or find the images or 2 images have mismatched size" << endl;
        return -1;
    }
    auto temp_pin = std::chrono::high_resolution_clock::now();

    //timing
    auto pin2 = std::chrono::high_resolution_clock::now();

    int h = distorted.rows;
    int w = distorted.cols;
    cout << h << " " << w << endl;
    size_t img_size = h*w*sizeof(uint8_t);
    size_t map_size = (h/block_size)*(w/block_size)*2*sizeof(int);

    uint8_t* d_distorted;
    uint8_t* d_reference;
    uint8_t* d_corrected;
    int* d_offset_map;

    cudaMalloc(&d_distorted, img_size);
    cudaMalloc(&d_reference, img_size);
    cudaMalloc(&d_corrected, img_size);
    cudaMalloc(&d_offset_map, map_size);

    cudaMemcpy(d_distorted, distorted.data, img_size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_reference, reference.data, img_size, cudaMemcpyHostToDevice);

    // Define CUDA grid and block size
    dim3 block_dim(search_size, search_size);
    dim3 grid_dim(w/block_size, h/block_size);
    cout << "number of blocks: " << grid_dim.x << " " << grid_dim.y << endl;
    cout << "number of threads: " << block_dim.x << " " << block_dim.y << endl;

    // Timing
    auto pin3 = std::chrono::high_resolution_clock::now();

    // Define the size of dynamically shared memory
    size_t shared_dim = (search_size*search_size)*sizeof(ErrorPos);
    correct_image_kernel<<<grid_dim, block_dim, shared_dim>>>(d_distorted, d_reference, d_corrected, d_offset_map, h, w);

    cudaDeviceSynchronize();
    auto pin4 = std::chrono::high_resolution_clock::now();

    cudaMemcpy(distorted.data, d_corrected, img_size, cudaMemcpyDeviceToHost);

    auto pin5 = std::chrono::high_resolution_clock::now();
    auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(pin2 - pin1).count();
    cout << "image read time: " << duration << " milliseconds" << std::endl;
    duration = std::chrono::duration_cast<std::chrono::milliseconds>(pin3 - pin2).count();
    cout << "memcpy hostToDevice time: " << duration << " milliseconds" << std::endl;
    duration = std::chrono::duration_cast<std::chrono::milliseconds>(pin4 - pin3).count();
    cout << "kernel run time: " << duration << " milliseconds" << std::endl;
    duration = std::chrono::duration_cast<std::chrono::milliseconds>(pin5 - pin4).count();
    cout << "memcpy deviceToHost time: " << duration << " milliseconds" << std::endl;
    duration = std::chrono::duration_cast<std::chrono::milliseconds>(pin5 - pin2).count();
    cout << "total run time(excluding image loading): " << duration << " milliseconds" << std::endl;
    duration = std::chrono::duration_cast<std::chrono::milliseconds>(pin2 - temp_pin).count();
    cout << "convert time: " << duration << " milliseconds" << std::endl;

    //cv::normalize(distorted, distorted, 0, 255, cv::NORM_MINMAX);
    imwrite("corrected_2.jpg", distorted);

    // Free memory
    cudaFree(d_distorted);
    cudaFree(d_reference);
    cudaFree(d_corrected);
    cudaFree(d_offset_map);

    return 0;
}

Make sure you are looking at release builds with full optimizations. GPUs are basically optimized for 32-bit data. Access to 8-bit data will often incur additional overhead (e.g. in the form of type-conversion instructions), although an expansion of run time from 3 to 5 milliseconds is a larger difference than I would expect.

If this were my code, I would first examine the differences in the generated machine code (SASS) between the two variants. If there is nothing in the code that could plausibly explain the performance difference, I would carefully compare the profiling statistics for the two kernel variants.

For image processing, it is usually best to combine byte-sized pixels and use the uchar4 type to grab four pixels in one 32-bit access, possibly processing these using CUDA’s SIMD intrinsics for further efficiency.
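
To make the idea concrete, here is a minimal, hypothetical sketch (not taken from the code above): a kernel that accumulates the sum of squared differences of two byte images while fetching four pixels per 32-bit load. The kernel name, parameters, and the atomicAdd accumulation are made up for this example, and it assumes the pixel count is a multiple of four and the buffers are 4-byte aligned; a tuned version would reduce per block before touching the global counter.

__global__
void ssd_uchar4(
    const uchar4* __restrict__ a,
    const uchar4* __restrict__ b,
    unsigned long long* result,   // accumulated sum of squared differences
    int n4)                       // number of uchar4 elements (pixels / 4)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= n4) return;

    uchar4 pa = a[idx];   // one 32-bit load fetches four byte pixels
    uchar4 pb = b[idx];

    // widen to int before the arithmetic
    int dx = (int)pa.x - (int)pb.x;
    int dy = (int)pa.y - (int)pb.y;
    int dz = (int)pa.z - (int)pb.z;
    int dw = (int)pa.w - (int)pb.w;

    // accumulate this thread's partial sum
    atomicAdd(result, (unsigned long long)(dx*dx + dy*dy + dz*dz + dw*dw));
}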

OK, I changed the shared-memory arrays to float and used static_cast to convert uint8_t to float when populating shared memory. Now it’s as fast as expected!
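
For reference, a sketch of roughly what that change looks like, assuming the rest of the kernel stays as posted (only the affected lines are shown): the global buffers remain uint8_t so the host-to-device copies stay small, while the shared-memory working set is widened to float so the inner loop does 32-bit shared-memory loads and float math instead of per-byte accesses.

    __shared__ float shared_distorted[(block_size+search_size)*(block_size+search_size)];
    __shared__ float ref_block[block_size*block_size];

    // ... inside the load loops, convert once on the way into shared memory:
    shared_distorted[ loadI * sharedSize + loadJ ] =
        static_cast<float>( d_distorted[ globalI * w + globalJ ] );
    ref_block[ loadI * block_size + loadJ ] =
        static_cast<float>( d_reference[ ( i + loadI ) * w + ( j + loadJ ) ] );

    // ... and convert back to uint8_t only when writing the corrected block out:
    d_corrected[(i + di) * w + (j + dj)] =
        static_cast<uint8_t>( shared_distorted[(best_di + di) * sharedSize + (best_dj + dj)] );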
