OK, I created an input buffer and an output buffer with cudaAllocMapped to store the greyscale data, and copied the NvBuffer into the input buffer with NvBuffer2Raw. Everything seems to be working.
Is there a better way of getting that data from the NvBuffer into a GpuMat?
void *DisparitySink::process(void *priv)
{
DisparitySink *ctx = static_cast<DisparitySink*>(priv);
CUcontext cuda_ctx = 0;
CUresult status;
// Allow zero copy access
cudaSetDeviceFlags(cudaDeviceMapHost);
try {
status = cuInit(0);
if(status != CUDA_SUCCESS)
throw status;
CUdevice dev;
status = cuDeviceGet(&dev, 0);
if(status != CUDA_SUCCESS)
throw status;
status = cuCtxCreate(&cuda_ctx, 0, dev);
if(status != CUDA_SUCCESS)
throw status;
}
catch (CUresult &status) {
const char *error;
cuGetErrorString(status, &error);
DEBUG_ERROR(error);
ctx->set_state(State::eos);
return nullptr;
}
cuda::GpuMat mapx;
cuda::GpuMat mapy;
Mat K, R, P;
Vec4d D;
FileStorage storage("1920x1080.yml", FileStorage::READ);
storage["K0"] >> K;
storage["D0"] >> D;
storage["R0"] >> R;
storage["P0"] >> P;
storage.release();
Mat cpu_mapx, cpu_mapy;
fisheye::initUndistortRectifyMap(
K, D, R, P, Size(ctx->m_width, ctx->m_height), CV_32FC1, cpu_mapx, cpu_mapy);
mapx.upload(cpu_mapx);
mapy.upload(cpu_mapy);
void *input_cpu = nullptr;
void *input_cuda = nullptr;
cudaAllocMapped(&input_cpu, &input_cuda, ctx->m_width*ctx->m_height);
cuda::GpuMat cv_in(ctx->m_height, ctx->m_width, CV_8UC1, input_cuda);
void *output_cpu = nullptr;
void *output_cuda = nullptr;
cudaAllocMapped(&output_cpu, &output_cuda, ctx->m_width*ctx->m_height);
cuda::GpuMat cv_out(ctx->m_height, ctx->m_width, CV_8UC1, output_cuda);
while(1) {
struct v4l2_buffer v4l2_buf;
struct v4l2_plane planes[MAX_PLANES];
memset(&v4l2_buf, 0, sizeof(v4l2_buf));
memset(planes, 0, sizeof(planes));
v4l2_buf.m.planes = planes;
pthread_mutex_lock(&ctx->m_capture_lock);
while(ctx->m_capture_queue->empty()) {
pthread_cond_wait(&ctx->m_capture_cond, &ctx->m_capture_lock);
}
NvBuffer *buffer = ctx->m_capture_queue->front().first;
struct timeval ts = ctx->m_capture_queue->front().second;
ctx->m_capture_queue->pop();
pthread_mutex_unlock(&ctx->m_capture_lock);
if(buffer->planes[0].bytesused == 0)
break;
v4l2_buf.index = buffer->index;
// Copy NvBuffer to mapped buffer
NvBuffer2Raw(
buffer->planes[0].fd, 0, ctx->m_width, ctx->m_height, static_cast<uint8_t*>(input_cuda));
cuda::remap(cv_in, cv_out, mapx, mapy, INTER_LINEAR);
// Re-queue MMAP buffer on capture plane
if(ctx->m_conv->capture_plane.qBuffer(v4l2_buf, nullptr) < 0) {
DEBUG_ERROR("failed to queue buffer");
ctx->m_conv->abort();
break;
}
// Write out image
if(ctx->m_write_flag) {
capture_frame(output_cpu, ctx->m_width, ctx->m_height, "cv_out.png");
ctx->m_write_flag = false;
}
}
status = cuCtxDestroy(cuda_ctx);
if(status != CUDA_SUCCESS)
DEBUG_WARN("unable to destroy CUDA context");
ctx->m_conv->capture_plane.waitForDQThread(2000);
DEBUG_VERBOSE("disparity thread finished");
return nullptr;
}