Hi, I'm trying to improve the runtime of my CUDA kernel, which performs block-based alignment of a distorted image against a given reference image. To make the cudaMemcpy calls faster, I switched the image data from float to uint8_t. On Google Colab with a T4 GPU the results looked normal: cudaMemcpy got much faster and the kernel runtime improved slightly. But when I tested locally on an NVIDIA RTX A4000, cudaMemcpy was much faster as expected, yet the kernel itself got slower, going from 3 ms to 5 ms. What could be the reason? My code is below; I tested with a 4K-resolution image.
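For reference, the float version had an extra host-side conversion step before the uploads, roughly like this (a sketch rather than my exact code; distorted and reference are the Mats from the main function below):

// Sketch of the conversion step the float version needed before cudaMemcpy
Mat distorted_f, reference_f;
distorted.convertTo(distorted_f, CV_32F);   // 8-bit grayscale -> 32-bit float
reference.convertTo(reference_f, CV_32F);
size_t img_size_f = h * w * sizeof(float);  // 4x the bytes of the uint8_t version
cudaMemcpy(d_distorted, distorted_f.data, img_size_f, cudaMemcpyHostToDevice);
cudaMemcpy(d_reference, reference_f.data, img_size_f, cudaMemcpyHostToDevice);

So the faster cudaMemcpy after the switch makes sense: each transfer now moves a quarter of the bytes.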
Using float:
__global__
void correct_image_kernel(
        const float* d_distorted,
        const float* d_reference,
        float* d_corrected,
        int* d_offset_map,
        int h,
        int w)
{
    extern __shared__ ErrorPos shared_data[];
    __shared__ float shared_distorted[(block_size + search_size) * (block_size + search_size)];
    int half_search = search_size / 2;
    int thread_id = threadIdx.y * blockDim.x + threadIdx.x;
    int sharedSize = block_size + search_size;
    // Calculate global coordinates of the block
    int block_i = blockIdx.y;
    int block_j = blockIdx.x;
    int i = block_i * block_size;
    int j = block_j * block_size;
    // Cooperatively load the search window and the reference block into shared memory
    __shared__ float ref_block[block_size * block_size];
    for (int loadI = threadIdx.y; loadI < sharedSize; loadI += blockDim.y) {
        for (int loadJ = threadIdx.x; loadJ < sharedSize; loadJ += blockDim.x) {
            int globalI = loadI + blockIdx.y * block_size - half_search;
            int globalJ = loadJ + blockIdx.x * block_size - half_search;
            if (globalI >= 0 && globalI < h && globalJ >= 0 && globalJ < w) {
                shared_distorted[loadI * sharedSize + loadJ] = d_distorted[globalI * w + globalJ];
            } else {
                shared_distorted[loadI * sharedSize + loadJ] = 0.0f;
            }
            if (loadI < block_size && loadJ < block_size) {
                ref_block[loadI * block_size + loadJ] = d_reference[(i + loadI) * w + (j + loadJ)];
            }
        }
    }
    __syncthreads();
    // Each thread evaluates one candidate offset: mean squared difference
    float error = 0.0f;
    for (int di = 0; di < block_size; ++di) {
        for (int dj = 0; dj < block_size; ++dj) {
            float diff = ref_block[di * block_size + dj]
                       - shared_distorted[(threadIdx.y + di) * sharedSize + (threadIdx.x + dj)];
            error += diff * diff;
        }
    }
    error = error / (block_size * block_size);
    // Store this thread's error and candidate position
    shared_data[thread_id].error = error;
    shared_data[thread_id].si = threadIdx.y;
    shared_data[thread_id].sj = threadIdx.x;
    __syncthreads();
    // Min-reduction over all candidates; strides are rounded up so a
    // non-power-of-two thread count (here 10x10 = 100) is handled correctly
    int n = blockDim.x * blockDim.y;
    for (int stride = (n + 1) / 2; n > 1; n = stride, stride = (stride + 1) / 2) {
        if (thread_id + stride < n &&
            shared_data[thread_id + stride].error < shared_data[thread_id].error) {
            shared_data[thread_id] = shared_data[thread_id + stride];
        }
        __syncthreads();
    }
    // 1st thread in the block writes the best match
    if (thread_id == 0) {
        int best_di = shared_data[0].si;
        int best_dj = shared_data[0].sj;
        d_offset_map[block_i * (w / block_size) * 2 + block_j * 2] = best_di;
        d_offset_map[block_i * (w / block_size) * 2 + block_j * 2 + 1] = best_dj;
        // Copy the best-fit block to the current position
        for (int di = 0; di < block_size; ++di) {
            for (int dj = 0; dj < block_size; ++dj) {
                d_corrected[(i + di) * w + (j + dj)] =
                    shared_distorted[(best_di + di) * sharedSize + (best_dj + dj)];
            }
        }
    }
}
Using uint8_t:
__global__
void correct_image_kernel(
        const uint8_t* d_distorted,
        const uint8_t* d_reference,
        uint8_t* d_corrected,
        int* d_offset_map,
        int h,
        int w)
{
    extern __shared__ ErrorPos shared_data[];
    __shared__ uint8_t shared_distorted[(block_size + search_size) * (block_size + search_size)];
    int half_search = search_size / 2;
    int thread_id = threadIdx.y * blockDim.x + threadIdx.x;
    int sharedSize = block_size + search_size;
    // Calculate global coordinates of the block
    int block_i = blockIdx.y;
    int block_j = blockIdx.x;
    int i = block_i * block_size;
    int j = block_j * block_size;
    // Cooperatively load the search window and the reference block into shared memory
    __shared__ uint8_t ref_block[block_size * block_size];
    for (int loadI = threadIdx.y; loadI < sharedSize; loadI += blockDim.y) {
        for (int loadJ = threadIdx.x; loadJ < sharedSize; loadJ += blockDim.x) {
            int globalI = loadI + blockIdx.y * block_size - half_search;
            int globalJ = loadJ + blockIdx.x * block_size - half_search;
            if (globalI >= 0 && globalI < h && globalJ >= 0 && globalJ < w) {
                shared_distorted[loadI * sharedSize + loadJ] = d_distorted[globalI * w + globalJ];
            } else {
                shared_distorted[loadI * sharedSize + loadJ] = 0;
            }
            if (loadI < block_size && loadJ < block_size) {
                ref_block[loadI * block_size + loadJ] = d_reference[(i + loadI) * w + (j + loadJ)];
            }
        }
    }
    __syncthreads();
    // Each thread evaluates one candidate offset: sum of squared differences
    // in integer arithmetic (the uint8_t operands promote to int)
    int error = 0;
    for (int di = 0; di < block_size; ++di) {
        for (int dj = 0; dj < block_size; ++dj) {
            int diff = ref_block[di * block_size + dj]
                     - shared_distorted[(threadIdx.y + di) * sharedSize + (threadIdx.x + dj)];
            error += diff * diff;
        }
    }
    float mse = static_cast<float>(error) / (block_size * block_size);
    // Store this thread's error and candidate position
    shared_data[thread_id].error = mse;
    shared_data[thread_id].si = threadIdx.y;
    shared_data[thread_id].sj = threadIdx.x;
    __syncthreads();
    // Min-reduction over all candidates; strides are rounded up so a
    // non-power-of-two thread count (here 10x10 = 100) is handled correctly
    int n = blockDim.x * blockDim.y;
    for (int stride = (n + 1) / 2; n > 1; n = stride, stride = (stride + 1) / 2) {
        if (thread_id + stride < n &&
            shared_data[thread_id + stride].error < shared_data[thread_id].error) {
            shared_data[thread_id] = shared_data[thread_id + stride];
        }
        __syncthreads();
    }
    // 1st thread in the block writes the best match
    if (thread_id == 0) {
        int best_di = shared_data[0].si;
        int best_dj = shared_data[0].sj;
        d_offset_map[block_i * (w / block_size) * 2 + block_j * 2] = best_di;
        d_offset_map[block_i * (w / block_size) * 2 + block_j * 2 + 1] = best_dj;
        // Copy the best-fit block to the current position
        for (int di = 0; di < block_size; ++di) {
            for (int dj = 0; dj < block_size; ++dj) {
                if (i + di < h && j + dj < w) {   // i, j, di, dj are all non-negative
                    d_corrected[(i + di) * w + (j + dj)] =
                        shared_distorted[(best_di + di) * sharedSize + (best_dj + dj)];
                }
            }
        }
    }
}
Here are the main function and the structs I used, just in case:
#include <cuda_runtime.h>
#include <chrono>
#include <iostream>
#include <string>
#include <opencv2/opencv.hpp>

using namespace cv;
using namespace std;

const int block_size = 15;
const int search_size = 10;
const string distorted_name = "../input/distorted_big.jpg";
const string reference_name = "../input/4096x4096.png";

// Error and candidate position, used for the reduction
struct ErrorPos{
    float error;
    int si, sj;
};

int main(){
    cudaFree(0); // force CUDA context creation so it is not counted in the timings
    auto pin1 = std::chrono::high_resolution_clock::now();
    Mat distorted = imread(distorted_name, IMREAD_GRAYSCALE);
    Mat reference = imread(reference_name, IMREAD_GRAYSCALE);
    if (distorted.empty() || reference.empty() || distorted.rows != reference.rows || distorted.cols != reference.cols){
        cerr << "Could not open the images, or the two images have mismatched sizes" << endl;
        return -1;
    }
    auto temp_pin = std::chrono::high_resolution_clock::now();
    // timing
    auto pin2 = std::chrono::high_resolution_clock::now();
    int h = distorted.rows;
    int w = distorted.cols;
    cout << "image size: " << h << " x " << w << endl;
    size_t img_size = h*w*sizeof(uint8_t);
    size_t map_size = (h/block_size)*(w/block_size)*2*sizeof(int);
    uint8_t* d_distorted;
    uint8_t* d_reference;
    uint8_t* d_corrected;
    int* d_offset_map;
    cudaMalloc(&d_distorted, img_size);
    cudaMalloc(&d_reference, img_size);
    cudaMalloc(&d_corrected, img_size);
    cudaMalloc(&d_offset_map, map_size);
    cudaMemcpy(d_distorted, distorted.data, img_size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_reference, reference.data, img_size, cudaMemcpyHostToDevice);
    // Define CUDA grid and block size: one thread block per image block,
    // one thread per candidate offset in the search window
    dim3 block_dim(search_size, search_size);
    dim3 grid_dim(w/block_size, h/block_size);
    cout << "number of blocks: " << grid_dim.x << " " << grid_dim.y << endl;
    cout << "number of threads: " << block_dim.x << " " << block_dim.y << endl;
    // Timing
    auto pin3 = std::chrono::high_resolution_clock::now();
    // Size of the dynamically allocated shared memory: one ErrorPos per thread
    size_t shared_dim = (search_size*search_size)*sizeof(ErrorPos);
    correct_image_kernel<<<grid_dim, block_dim, shared_dim>>>(d_distorted, d_reference, d_corrected, d_offset_map, h, w);
    cudaDeviceSynchronize();
    auto pin4 = std::chrono::high_resolution_clock::now();
    cudaMemcpy(distorted.data, d_corrected, img_size, cudaMemcpyDeviceToHost);
    auto pin5 = std::chrono::high_resolution_clock::now();
    auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(pin2 - pin1).count();
    cout << "image read time: " << duration << " milliseconds" << std::endl;
    duration = std::chrono::duration_cast<std::chrono::milliseconds>(pin3 - pin2).count();
    cout << "memcpy hostToDevice time: " << duration << " milliseconds" << std::endl; // note: this interval also includes the cudaMalloc calls
    duration = std::chrono::duration_cast<std::chrono::milliseconds>(pin4 - pin3).count();
    cout << "kernel run time: " << duration << " milliseconds" << std::endl;
    duration = std::chrono::duration_cast<std::chrono::milliseconds>(pin5 - pin4).count();
    cout << "memcpy deviceToHost time: " << duration << " milliseconds" << std::endl;
    duration = std::chrono::duration_cast<std::chrono::milliseconds>(pin5 - pin2).count();
    cout << "total run time (excluding image loading): " << duration << " milliseconds" << std::endl;
    // Leftover from the float version, where the uint8->float conversion
    // happened between temp_pin and pin2; it measures ~0 here
    duration = std::chrono::duration_cast<std::chrono::milliseconds>(pin2 - temp_pin).count();
    cout << "convert time: " << duration << " milliseconds" << std::endl;
    //cv::normalize(distorted, distorted, 0, 255, cv::NORM_MINMAX);
    imwrite("corrected_2.jpg", distorted);
    // Free memory
    cudaFree(d_distorted);
    cudaFree(d_reference);
    cudaFree(d_corrected);
    cudaFree(d_offset_map);
    return 0;
}
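In case my host-side timing is part of the problem: here is a minimal sketch (not what I currently run) of how the kernel alone could be timed with CUDA events instead of std::chrono, reusing the buffers and launch configuration from the main function above:

// Sketch: timing only the kernel with CUDA events (assumes d_distorted,
// d_reference, d_corrected, d_offset_map, grid_dim, block_dim, shared_dim,
// h, and w from the main function above are in scope)
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);
correct_image_kernel<<<grid_dim, block_dim, shared_dim>>>(d_distorted, d_reference, d_corrected, d_offset_map, h, w);
cudaEventRecord(stop);
cudaEventSynchronize(stop);
float ms = 0.0f;
cudaEventElapsedTime(&ms, start, stop);   // elapsed GPU time in milliseconds
cout << "kernel time (events): " << ms << " ms" << endl;
cudaEventDestroy(start);
cudaEventDestroy(stop);

cudaEventElapsedTime reports GPU execution time only, so this would rule out host-side launch or synchronization overhead in my chrono numbers.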