Encoding OpenGL textures live on Windows

Hello,

I have a real-time interactive 3D application with an OpenGL rendering pipeline that runs on Windows. Each frame, I want to use the rendered OpenGL framebuffer texture as input to a low-latency NvEncoder and write the encoded data to host (CPU) memory.

I am under the impression that this is not possible on Windows without first copying the OpenGL memory to CUDA memory. Any guess as to how expensive such a memory copy is?

Is it possible for CUDA to know when OpenGL has finished rendering the frame, so that the memory copy and the encoding can be kicked off precisely when rendering is finished? I suppose I would like to queue the CUDA encoding instructions after all OpenGL rendering calls.

EDIT:

I attempted this and it seems to work out.

//Init: create a pixel-pack buffer sized for one frame at 4 bytes per pixel
//(matches the GL_RGBA / GL_UNSIGNED_BYTE read-back used each frame).
glGenBuffers(1, &m_glPixelBuffer);
glBindBuffer(GL_PIXEL_PACK_BUFFER, m_glPixelBuffer);
glBufferData(GL_PIXEL_PACK_BUFFER, m_nRenderWidth * m_nRenderHeight * 4, NULL, GL_STREAM_COPY);
//Unbind again so that unrelated read-back calls do not write into this buffer by accident.
glBindBuffer(GL_PIXEL_PACK_BUFFER, 0);

//Register the buffer with CUDA once. CUDA only reads from it, hence the read-only flag.
//The result is checked with ck() like every other CUDA call.
ck(cudaGraphicsGLRegisterBuffer(&m_cudaGraphicsResource, m_glPixelBuffer, cudaGraphicsRegisterFlagsReadOnly));

//
//Render to m_testTexture
//

//Copy texture memory to a GL_PIXEL_PACK_BUFFER (Seems to be required)
//While a pixel-pack buffer is bound, the last argument of glGetTexImage is a byte
//offset into that buffer (0 here) rather than a client memory pointer.
glBindBuffer(GL_PIXEL_PACK_BUFFER, m_glPixelBuffer);
glBindTexture(GL_TEXTURE_2D, m_testTexture);
glGetTexImage(GL_TEXTURE_2D, 0, GL_RGBA, GL_UNSIGNED_BYTE, 0);
glBindBuffer(GL_PIXEL_PACK_BUFFER, 0);

//No explicit GL fence is needed here: cudaGraphicsMapResources (called inside
//CopyGLtoCuda) guarantees that graphics calls issued before the map complete before
//any subsequent CUDA work begins. The previous glFenceSync/glWaitSync pair only made
//the GL server wait (not the CPU or CUDA) and leaked one sync object per frame,
//because glDeleteSync was never called.

//Copy from pixel buffer to cuda
CopyGLtoCuda(m_cudaGraphicsResource, copySize);

//From graphics resource to encoder cuda memory.
//
//Maps the registered GL buffer into the CUDA address space, copies _size bytes from it
//into m_cudaMemory (device-to-device) and unmaps the resource again.
//
//  _ptr  - graphics resource previously registered with cudaGraphicsGLRegisterBuffer
//  _size - number of bytes to copy; must not exceed the size of the mapped buffer
//
//Every CUDA call is reported through ck(). If the mapped pointer is unavailable or
//_size is larger than the mapped buffer, the copy is skipped and cudaErrorInvalidValue
//is reported instead of reading past the end of the buffer.
void EncLowLatency::CopyGLtoCuda(cudaGraphicsResource* _ptr, size_t _size)
{
    //Mapping also synchronizes with OpenGL: GL work issued before this call
    //completes before the CUDA work issued below starts.
    ck(cudaGraphicsMapResources(1, &_ptr));

    void* devPtr = NULL;
    size_t availableSize = 0;
    //Previously the result was stored in a local and never inspected.
    ck(cudaGraphicsResourceGetMappedPointer(&devPtr, &availableSize, _ptr));

    if (devPtr != NULL && _size <= availableSize)
    {
        //cuMemcpyDtoD takes a size_t byte count, so no narrowing cast is needed.
        ck(cuMemcpyDtoD(m_cudaMemory, (CUdeviceptr)devPtr, _size));
    }
    else
    {
        //Refuse to copy from a missing mapping or beyond the mapped range.
        ck(cudaErrorInvalidValue);
    }

    //Always hand the buffer back to OpenGL, even when the copy was skipped.
    ck(cudaGraphicsUnmapResources(1, &_ptr));
}

At 1080p on a GTX 1080 I get around 3 ms of overhead from the memory copy, and the encoder thread seems to run at 6 ms per frame.

Any suggestions on how to improve the copy speed?