Dear Community,
I am computing a Gaussian filter on the Jetson Nano; the image size is 1920×1200. First I used OpenCV, which runs on the CPU. Then I wrote a CUDA version, but the CUDA version takes more time than OpenCV on the CPU. I can't understand why the CPU is faster than the GPU.
Here is the OpenCV (CPU) code:
cv::Mat smooth_mat(d_image_height_, d_image_width_, CV_8UC1, pattern_ptr);
cv::GaussianBlur(smooth_mat, smooth_mat, cv::Size(5, 5), 1, 1);
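For reference, a minimal sketch of running the same blur through OpenCV's own GPU filter, assuming your OpenCV build on the Nano has the CUDA cudafilters module enabled (smooth_mat is the same CV_8UC1 image as above):

#include <opencv2/core.hpp>
#include <opencv2/cudafilters.hpp>

cv::cuda::GpuMat d_src, d_dst;
d_src.upload(smooth_mat);  // host -> device copy
cv::Ptr<cv::cuda::Filter> gauss =
    cv::cuda::createGaussianFilter(CV_8UC1, CV_8UC1, cv::Size(5, 5), 1, 1);
gauss->apply(d_src, d_dst);  // blur runs on the GPU
d_dst.download(smooth_mat);  // device -> host copy

Note the explicit upload/download: for a single 1920×1200 frame, those two copies can cost as much as the filter itself.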
Here is the CUDA code:
int main(void)
{
    dim3 threadsPerBlock(8, 8);
    dim3 blocksPerGrid((d_image_width_ + threadsPerBlock.x - 1) / threadsPerBlock.x,
                       (d_image_height_ + threadsPerBlock.y - 1) / threadsPerBlock.y);
    const size_t num_bytes = d_image_height_ * d_image_width_ * sizeof(unsigned char);
    cudaMallocManaged((void **)&d_patterns_list_[serial_flag], num_bytes);
    // Blur into a separate buffer: passing the same pointer as src and dst races,
    // because each thread reads neighbours that other threads may already have overwritten.
    unsigned char *d_blurred = nullptr;
    cudaMallocManaged((void **)&d_blurred, num_bytes);
    // Some fill-data code is omitted here...
    kernel_gaussian_blur<<<blocksPerGrid, threadsPerBlock>>>(
        d_patterns_list_[serial_flag], d_blurred, d_image_height_, d_image_width_, gauss_filter_width);
    cudaDeviceSynchronize();
    return 0;
}
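One thing to check before comparing numbers: if the measured time includes cudaMallocManaged and the first kernel launch, you are mostly timing one-time CUDA initialization and managed-memory page migration, not the blur. A minimal sketch of timing only the kernel with CUDA events (d_src/d_dst stand in for the buffers above):

cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

// Warm-up launch: the first launch pays one-time setup cost.
kernel_gaussian_blur<<<blocksPerGrid, threadsPerBlock>>>(d_src, d_dst, d_image_height_, d_image_width_, gauss_filter_width);
cudaDeviceSynchronize();

cudaEventRecord(start);
kernel_gaussian_blur<<<blocksPerGrid, threadsPerBlock>>>(d_src, d_dst, d_image_height_, d_image_width_, gauss_filter_width);
cudaEventRecord(stop);
cudaEventSynchronize(stop);

float ms = 0.0f;
cudaEventElapsedTime(&ms, start, stop);  // kernel time only, in milliseconds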
__global__ void kernel_gaussian_blur(const uchar* src, uchar* dst, int height, int width, int filterWidth)
{
    int y = blockDim.y * blockIdx.y + threadIdx.y;  // row
    int x = blockDim.x * blockIdx.x + threadIdx.x;  // column
    if (y >= height || x >= width)
    {
        return;
    }
    int ind = y * width + x;
    // Convolve with the Gaussian coefficients in constant memory,
    // clamping the neighbourhood to the image border.
    float color = 0.0f;
    for (int i = 0; i < filterWidth; i++)
    {
        for (int j = 0; j < filterWidth; j++)
        {
            int clamp_x = min(max(x + j - filterWidth / 2, 0), width - 1);
            int clamp_y = min(max(y + i - filterWidth / 2, 0), height - 1);
            color += d_const_Gaussian_5_5[i * filterWidth + j] * static_cast<float>(src[clamp_y * width + clamp_x]);
        }
    }
    dst[ind] = static_cast<uchar>(color + 0.5f);  // round instead of truncating
}
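A likely reason the naive kernel is slow on the Nano is that every thread re-reads its 5×5 neighbourhood from global memory, so each pixel is fetched up to 25 times per block. Below is a minimal sketch of a tiled variant that first stages each block's pixels plus a 2-pixel halo in shared memory (hard-coded for the 5×5 filter; launch it with the same 8×8 blocks):

constexpr int TILE = 8;    // must match threadsPerBlock
constexpr int RADIUS = 2;  // filterWidth / 2 for a 5x5 filter

__global__ void kernel_gaussian_blur_tiled(const uchar* src, uchar* dst, int height, int width)
{
    __shared__ uchar tile[TILE + 2 * RADIUS][TILE + 2 * RADIUS];

    // Cooperatively load the 12x12 tile (8x8 block + halo), clamping at the border.
    for (int ty = threadIdx.y; ty < TILE + 2 * RADIUS; ty += TILE)
    {
        for (int tx = threadIdx.x; tx < TILE + 2 * RADIUS; tx += TILE)
        {
            int gx = min(max(int(blockIdx.x) * TILE + tx - RADIUS, 0), width - 1);
            int gy = min(max(int(blockIdx.y) * TILE + ty - RADIUS, 0), height - 1);
            tile[ty][tx] = src[gy * width + gx];
        }
    }
    __syncthreads();

    int x = int(blockIdx.x) * TILE + threadIdx.x;
    int y = int(blockIdx.y) * TILE + threadIdx.y;
    if (x >= width || y >= height)
        return;

    // Each neighbour now comes from shared memory instead of global memory.
    float color = 0.0f;
    for (int i = 0; i < 5; i++)
        for (int j = 0; j < 5; j++)
            color += d_const_Gaussian_5_5[i * 5 + j] *
                     static_cast<float>(tile[threadIdx.y + i][threadIdx.x + j]);

    dst[y * width + x] = static_cast<uchar>(color + 0.5f);
}

A 5×5 Gaussian is also separable, so splitting it into a horizontal and a vertical 1D pass (which is what cv::GaussianBlur does internally on the CPU) cuts the arithmetic from 25 to 10 multiplies per pixel.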