Hello,
I have a real-time interactive 3D application with an OpenGL rendering pipeline that runs on Windows. I want to use rendered OpenGL framebuffer textures as input to a low latency NvEncoder each frame and output the encoded memory to RAM/CPU.
I am under the impression that this is not possible on Windows without first moving the OpenGL memory to CUDA memory. Does anyone have an estimate of how expensive such a memory copy is?
Is it possible for CUDA to know when OpenGL has finished rendering the frame, so that the memory copy and the encoding can be kicked off precisely when rendering is finished? I suppose I would like to queue the CUDA encoding instructions after all of the OpenGL rendering calls.
EDIT:
I attempted this and it seems to work out.
//Init
// Allocate a pixel pack buffer (PBO) large enough for one frame at 4 bytes per
// pixel. The size is widened to GLsizeiptr before multiplying so the product is
// not computed in a narrower integer type.
glGenBuffers(1, &m_glPixelBuffer);
glBindBuffer(GL_PIXEL_PACK_BUFFER, m_glPixelBuffer);
glBufferData(GL_PIXEL_PACK_BUFFER, (GLsizeiptr)m_nRenderWidth * m_nRenderHeight * 4, NULL, GL_STREAM_COPY);
// Unbind again: while a buffer stays bound to GL_PIXEL_PACK_BUFFER, every later
// pack operation (glReadPixels, glGetTexImage, ...) writes into it instead of
// into client memory.
glBindBuffer(GL_PIXEL_PACK_BUFFER, 0);
//Register
// Register the PBO with CUDA once, up front. The return code is routed through
// ck() like the other CUDA calls so a failed registration is not silently ignored.
ck(cudaGraphicsGLRegisterBuffer(&m_cudaGraphicsResource, m_glPixelBuffer, cudaGraphicsRegisterFlagsReadOnly));
//
//Render to m_testTexture
//
//Copy texture memory to a GL_PIXEL_PACK_BUFFER (Seems to be required)
// With a buffer bound to GL_PIXEL_PACK_BUFFER, the last argument of
// glGetTexImage is a byte offset into that buffer (0 here), not a client pointer.
glBindBuffer(GL_PIXEL_PACK_BUFFER, m_glPixelBuffer);
glBindTexture(GL_TEXTURE_2D, m_testTexture);
glGetTexImage(GL_TEXTURE_2D, 0, GL_RGBA, GL_UNSIGNED_BYTE, 0);
glBindBuffer(GL_PIXEL_PACK_BUFFER, 0);
//Order the GL commands above before the CUDA copy
// NOTE: glWaitSync is a server-side wait; it does not block the calling CPU
// thread. NOTE(review): the CUDA documentation for cudaGraphicsMapResources
// states that graphics calls issued before the map complete before subsequent
// CUDA work begins, so this fence may be redundant - confirm before removing.
GLsync sync = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
glWaitSync(sync, 0, GL_TIMEOUT_IGNORED);
// Release the sync object; otherwise a new one is leaked every frame. Deleting
// it right after glWaitSync is allowed: the object is only flagged for deletion
// and is freed once it is no longer associated with a pending wait.
glDeleteSync(sync);
//Copy from pixel buffer to cuda
CopyGLtoCuda(m_cudaGraphicsResource, copySize);
//From graphics resource to encoder cuda memory
/**
 * Copies the contents of a registered graphics resource into m_cudaMemory.
 *
 * The resource is mapped, its device pointer is queried, _size bytes are copied
 * device-to-device into m_cudaMemory, and the resource is unmapped again.
 *
 * @param _ptr  Graphics resource to read from (passed to cudaGraphicsMapResources).
 * @param _size Number of bytes to copy; must not exceed the mapped size reported
 *              by cudaGraphicsResourceGetMappedPointer.
 *
 * Errors are reported through ck(). If the mapped pointer cannot be obtained or
 * _size is larger than the mapped range, no copy is issued.
 */
void EncLowLatency::CopyGLtoCuda(cudaGraphicsResource* _ptr, size_t _size)
{
    ck(cudaGraphicsMapResources(1, &_ptr));

    void* devPtr = 0;
    size_t availableSize = 0;
    cudaError_t err = cudaGraphicsResourceGetMappedPointer(&devPtr, &availableSize, _ptr);

    // Refuse to read past the end of the mapped buffer.
    if (err == cudaSuccess && _size > availableSize)
    {
        err = cudaErrorInvalidValue;
    }

    if (err == cudaSuccess)
    {
        // cuMemcpyDtoD takes a size_t byte count, so _size is passed through
        // without narrowing it to 32 bits.
        ck(cuMemcpyDtoD(m_cudaMemory, (CUdeviceptr)devPtr, _size));
    }

    // Always unmap before reporting the earlier error so the resource is not
    // left mapped.
    ck(cudaGraphicsUnmapResources(1, &_ptr));
    ck(err);
}
At 1080p on a GTX 1080 I measure around 3 ms of overhead from the memory copy, and the encoder thread seems to run at about 6 ms per frame.
Any suggestions on how to improve the copy speed?