Hello,
I’ve got a very basic performance issue. The kernel in the code below takes 18 ms to complete, even in this very simplified version. The kernel's theoretical occupancy is 100%; the achieved occupancy is 86%.
Functions implemented in OpenCV run at adequate speed; e.g. cuda::gammaCorrection and cuda::demosaicing each take only about 2 ms per call.
After reading some other topics, I think the problem has something to do with the memory accesses or the pointers to the GpuMats, but I am not sure how to fix it.
I am using Visual Studio 13 with Nsight and OpenCV 3.0.0 with CUDA 7.5, as available from http://sourceforge.net/projects/opencvprebuilt/. My machine: i7-5930K, 32GB DDR4, Titan X, Win 7 x64.
Thank you for your help!
#include <opencv2\opencv.hpp>
#include <opencv2\core\cuda.hpp>
#include <opencv2\core\opengl.hpp>
#include <opencv2\cudaimgproc.hpp>
#include <opencv2\cudawarping.hpp>
#include <opencv2\cudalegacy.hpp>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
// Device-side maps: allocated in main() and passed to createMapKernel by createMapCuda().
cuda::GpuMat d_mapx, d_mapy;
// Sensor frame size; used to allocate both maps and to size the kernel launch.
const Size size_sensor = Size(5120, 3840);
// Forward declaration: createMapCuda() is defined further down in this file
// but is called from main().
void createMapCuda(void);

// Program entry point.
//
// Selects CUDA device 0, allocates the two single-channel float maps at
// size_sensor resolution, then regenerates them via createMapCuda() in a loop
// until userQuit becomes non-zero.
//
// Returns 0 on normal exit, 1 if the CUDA device could not be selected.
int main(void)
{
    int userQuit = 0;

    // Bail out early instead of letting every later CUDA call fail with a
    // sticky error if the device cannot be selected.
    if (cudaSetDevice(0) != cudaSuccess)
    {
        return 1;
    }

    d_mapx = cuda::GpuMat(size_sensor, CV_32FC1);
    d_mapy = cuda::GpuMat(size_sensor, CV_32FC1);

    while (!userQuit)
    {
        createMapCuda();
        //other code
    }

    return 0;
}
// Kernel: one thread per map element.
//
// Launch layout: 2D grid of 2D blocks; thread (dx, dy) handles column dx of
// row dy. The grid may overhang the image, so out-of-range threads exit
// without writing. No shared memory is used.
//
// d_mapx, d_mapy        output maps; must be at least cols x rows elements.
// cols, rows            extent of the region to fill.
// half_cols, half_rows,
// da, db                inputs for the real algorithm; unused by the
//                       placeholder body below.
__global__ void createMapKernel(cuda::PtrStepSzf d_mapx, cuda::PtrStepSzf d_mapy, const int cols, const int rows, const float half_cols, const float half_rows, const float da, const float db)
{
    const int dx = blockDim.x * blockIdx.x + threadIdx.x;
    const int dy = blockDim.y * blockIdx.y + threadIdx.y;

    // The host rounds the grid up to whole blocks, so threads that fall
    // outside the image must not touch memory.
    if (dx >= cols || dy >= rows)
    {
        return;
    }

    //Next two lines are the place holders for the real algorithm,
    //which writes the results back in the same way
    d_mapx.ptr(dy)[dx] = static_cast<float>(dx);
    d_mapy.ptr(dy)[dx] = static_cast<float>(dy);
}

// Host wrapper: launches createMapKernel over the whole sensor area on the
// default stream and blocks until it has finished.
//
// Reads the file-scope size_sensor, d_mapx, d_mapy and the distortion
// settings f_distortionA / f_distortionB (scaled by 0.01 before being passed
// to the kernel). Launch and execution errors are reported through
// cudaSafeCall.
//
// NOTE(review): the placeholder kernel only performs two stores per pixel. If
// it still runs far slower than the device memory bandwidth suggests, check
// whether device code is being built in debug mode (nvcc -G) -- the build
// settings are not visible here, confirm.
void createMapCuda(void)
{
    const int cols = size_sensor.width;
    const int rows = size_sensor.height;

    // Floating-point halves: integer division would silently drop the .5 for
    // odd dimensions.
    const float half_rows = size_sensor.height / 2.0f;
    const float half_cols = size_sensor.width / 2.0f;

    // Float literal keeps the arithmetic in single precision.
    const float a = 0.01f * f_distortionA;
    const float b = 0.01f * f_distortionB;

    // 32 threads wide: each warp covers 32 consecutive columns of a single
    // row, i.e. one contiguous run of floats per store.
    const dim3 dimBlock(32, 8);

    // Integer ceil-division so that sizes which are not a multiple of the
    // block still get full coverage (the kernel guards the overhang).
    const dim3 dimGrid((cols + dimBlock.x - 1) / dimBlock.x,
                       (rows + dimBlock.y - 1) / dimBlock.y);

    createMapKernel<<<dimGrid, dimBlock>>>(d_mapx, d_mapy, cols, rows, half_cols, half_rows, a, b);

    // Launch-configuration errors surface here ...
    cudaSafeCall(cudaGetLastError());
    // ... and faults during execution surface at the synchronisation point.
    cudaSafeCall(cudaDeviceSynchronize());
}