Passing source pointer from OpenGL texture to cuda kernel?

Greetings All,

I have a question regarding performance when interop between cuda and opengl is involved, and secondarily I was hoping to get a clarification on how to read/write to an opengl texture from a cuda thread.

What I have working currently is a standard case: a GPU byte array which is mapped in CUDA and is copied to an OpenGL texture via cudaMemcpyToArray. The steps I use for this are:

//opengl portion
// Create a 2D texture and allocate its storage. The NULL data pointer means
// no pixels are uploaded here -- storage is allocated, contents undefined.
glGenTextures(1, &TextureHandle);
glBindTexture(GL_TEXTURE_2D, TextureHandle);
glTexImage2D(GL_TEXTURE_2D, 0, Settings.StorageFormat, Size.Width, Size.Height, 0, Settings.UsageFormat, Settings.DataType, NULL);

//Cuda portion
// Register the GL texture with CUDA, map it, then copy an existing device
// buffer into the texture's backing cudaArray (device-to-device copy).
// NOTE(review): cudaMemcpyToArray is deprecated in recent CUDA toolkits;
// cudaMemcpy2DToArray is the documented replacement -- confirm toolkit version.
// NOTE(review): none of these calls check the returned cudaError_t; a failed
// register/map makes every later call fail mysteriously.
cudaGraphicsGLRegisterImage(&CudaRes, TextureHandle, GL_TEXTURE_2D, cudaGraphicsMapFlagsNone);
cudaArray *texture_ptr;
cudaGraphicsMapResources(1, &CudaRes);
cudaGraphicsSubResourceGetMappedArray(&texture_ptr, CudaRes, 0, 0);
cudaMemcpyToArray( texture_ptr, 0, 0, GpuImage->data(), GpuImage->bytes(), cudaMemcpyDeviceToDevice);
cudaGraphicsUnmapResources(1, &CudaRes);

This works just fine and renders just fine. The change I am trying to make and understand is how to get access to the OpenGL texture directly without the use of a secondary buffer. My kernel is relatively straightforward and should write a constant color:

// Writes a constant color (orange, full alpha) to every pixel of a float4
// color buffer. Expects a 2D launch; INDEX_2D flattens (x, y) via the stride.
//
// d_colorBuffer:  output buffer, one core::float4 per pixel
// colorInfo:      per-pixel color input (unused in this test kernel)
// stride:         row stride passed to INDEX_2D -- assumed to be in
//                 elements, TODO confirm against INDEX_2D's definition
// numberOfPoints: total pixel count, used as the bounds guard
//
// Fixed vs. the forum paste: the parameter list was never closed, the body
// braces were stripped, and the bounds check had no statement -- restored
// the `)`, `{ ... }` and the early `return;`.
__global__ void cuTestTexture(
        core::float4* d_colorBuffer,
        /* in */
        const core::float4* colorInfo,
        const size_t stride,
        const core::uint numberOfPoints )
{
    const unsigned int IDx = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int IDy = blockIdx.y * blockDim.y + threadIdx.y;
    const unsigned int pixelID = INDEX_2D(IDx, IDy, stride);

    // Grids rarely divide the image evenly; out-of-range threads must exit
    // before touching memory.
    if( pixelID >= numberOfPoints )
        return;

    d_colorBuffer[pixelID] = core::float4(1, 0.5f, 0, 1);
}

I have tried calling cudaGraphicsSubResourceGetMappedArray, which gives me a cudaArray*, but I am not sure how to pass this to my kernel as a float4*. Trying to get a pointer to the first byte of the texture via cudaGraphicsResourceGetMappedPointer instead returns cudaErrorUnknown.

This presentation:

starting at around slide 22, demonstrates how to have a CUDA kernel write data that shows up in an OpenGL texture, without any extra buffer copying.

Appreciate the link txbob!

I noticed in the slides, they create a pixel buffer object and generate a texture:

Slide 23:glBufferData(GL_PIXEL_UNPACK_BUFFER, Width * Height * 4,
Slide 24: glTexImage2D( GL_TEXTURE_2D, 0, GL_RGBA8, Width, Height, 0, GL_BGRA,

Wouldn’t this actually allocate twice the memory anyway? And thus, the Pixel Buffer Object just becomes a GPU-owned byte array in the end?

I didn’t end up using a pixel buffer object though as the GPU Tech Conference describes, so below are my findings as it may help someone get up and running by using the cudaSurfaceObject compute features. For two textures of 640x480, I was getting ~1800+ frames per second on a GTX 780.

//Creating the OpenGL texture:
glGenTextures(1, &TextureHandle);
glBindTexture(GL_TEXTURE_2D, TextureHandle);
// Filtering and wrap state come from the caller-supplied Settings struct.
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, Settings.MinFilter);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, Settings.MagFilter);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, Settings.WrapModeS);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, Settings.WrapModeT);

// At this point you can do several things
// Upload a cpu based image:
//glTexImage2D(GL_TEXTURE_2D, 0, Settings.StorageFormat, Size.Width, Size.Height, 0, Settings.UsageFormat, Settings.DataType, image->RawPixels());
// Upload a gpu based image:
// We pass NULL because there is no CPU bound buffer, this will do the storage copy since it is on the GPU, and openGL has no API which can take in a GPU device ptr to memory, that part is handled by cuda API calls
//glTexImage2D(GL_TEXTURE_2D, 0, Settings.StorageFormat, GpuImage->width(), GpuImage->height(), 0, Settings.UsageFormat, Settings.DataType, NULL); //0 for 0 border

//What we want really is to create the memory only once: allocate storage
//with no initial pixel data (NULL) -- CUDA will write the contents later.
glTexImage2D(GL_TEXTURE_2D, 0, Settings.StorageFormat, Size.Width, Size.Height, 0, Settings.UsageFormat, Settings.DataType, NULL);

//Register texture for usage in Cuda. The SurfaceLoadStore flag is important:
//we plan to map this image to a cudaSurfaceObject_t for writing, and
//registering without it makes surface writes invalid.
// NOTE(review): return value unchecked -- registration fails for texture
// formats CUDA cannot map; confirm Settings.StorageFormat is a supported one.
cudaGraphicsGLRegisterImage(&CudaRes, TextureHandle, GL_TEXTURE_2D, cudaGraphicsRegisterFlagsSurfaceLoadStore);

//At some later time we update pixels via cuda:
//At some later time we update pixels via cuda:
glBindTexture(GL_TEXTURE_2D, TextureHandle);
cudaArray* texture_ptr;
cudaGraphicsMapResources(1, &CudaRes, 0);
cudaGraphicsSubResourceGetMappedArray(&texture_ptr, CudaRes, 0, 0);

//Pass texture_ptr to a launch kernel, global c function
//(fixed: the mapped array declared above is texture_ptr; the original paste
// passed a nonexistent variable named textureData)
krnlTestTexture( grid, block, &texture_ptr, ... );

//Inside the launch wrapper (krnlTestTexture), build a resource description
//and a surface object for writing; openGlTexture is the cudaArray**
//parameter received from the caller.
struct cudaResourceDesc description;
memset(&description, 0, sizeof(description));
description.resType = cudaResourceTypeArray;
description.res.array.array = *openGlTexture;

cudaSurfaceObject_t target;
cudaError_t error = cudaCreateSurfaceObject(&target, &description);
// NOTE(review): check 'error' before launching -- a failed creation leaves
// 'target' unusable and the launch will fault.

//now we can launch kernel with the cudaSurfaceObject created from the cudaArray
cuTestTexture<<<grid,dim>>>( target, ... );

//Fixed: destroy the surface object after the launch. It references the
//mapped array, which becomes invalid at unmap time; recreating one every
//frame without destroying it leaks the handle.
cudaDestroySurfaceObject(target);

//Texture kernel. Knowledge of the texture format can be passed in, and this is
//going to be important: surf2Dwrite's x parameter is a BYTE offset, so you need
//intimate knowledge of how the texture memory is laid out. For my case I am
//using GL_RGBA, and I want to write a 'pixel' as four separate byte channels.
//sizeof(uchar4) == sizeof(float) == 4, so either works as the texel byte size.

// Writes a test gradient into an RGBA8 texture through a surface object.
// surf2Dwrite takes a BYTE offset in x, so the column index is scaled by
// the texel size (uchar4 == 4 bytes for GL_RGBA8).
//
// target:         surface object bound to the mapped GL texture's cudaArray
// colorInfo:      per-pixel color input (unused in this test kernel)
// stride:         row width -- assumed to be in pixels, TODO confirm
// numberOfPoints: total pixel count
// frameCount:     frame counter (unused in this test kernel)
//
// Fixed vs. the forum paste: restored the stripped `)` and `{ ... }`, added
// the missing bounds guard (the grid is rounded up, so unguarded threads
// performed out-of-bounds surface writes), and scaled x by sizeof(uchar4)
// instead of the coincidentally-equal sizeof(float).
__global__ void cuTestTexture(
        cudaSurfaceObject_t target,
        /* in */
        const core::float4* colorInfo,
        const size_t stride,
        const core::uint numberOfPoints,
        const int frameCount )
{
    const unsigned int IDx = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int IDy = blockIdx.y * blockDim.y + threadIdx.y;

    // Clamp to the image extent in both dimensions before writing.
    if( IDx >= stride || INDEX_2D(IDx, IDy, stride) >= numberOfPoints )
        return;

    uchar4 data = make_uchar4(IDx%255, IDy%255, 0x00, 0xff);
    // x is a byte offset: one uchar4 (4 bytes) per texel.
    surf2Dwrite(data, target, IDx * sizeof(uchar4), IDy);
}

//Unmap cuda
//Unbind openGL

//Example usage:
        // Fetch (or lazily create) the shared test texture, then dispatch the
        // CUDA kernel that fills it each frame.
        ui::GPUTexture* texture = (ui::GPUTexture*)ui::TextureManager::GetInstance()->GetTexture("TestTexture");
        if( texture == NULL )
        {
            // First use: create the texture sized to the color map and hand
            // ownership to the manager.
            ui::TextureSettings textureSettings( GL_LINEAR, GL_LINEAR, GL_RGBA, GL_RGBA, GL_FLOAT, false, false);
            geometry::Size textureSize = geometry::Size( d_colorMap->width(), d_colorMap->height());
            ui::GPUTexture* mutableTexture = new ui::GPUTexture( textureSize, textureSettings);
            ui::TextureManager::GetInstance()->AddTexture("TestTexture", mutableTexture);
            mutableTexture->Release(); //give ownership to manager
            // Fixed: 'texture' was still NULL past this point; the original
            // snippet then called BindForCuda() through the NULL pointer.
            texture = mutableTexture;
        }

        cudaArray* textureData = texture->BindForCuda();

        dim3 grid;
        dim3 block;
        cuda::CudaDimsFromImage(grid, block, d_colorMap->width(), d_colorMap->height());
        int count = d_colorMap->width() * d_colorMap->height();
        krnlTestTexture( grid, block, &textureData, (const core::float4*)d_colorMap->data(), d_colorMap->stride(), count, FrameCount);



//Rendering in openGL, I use shaders, so ultimately I want this texture to be passed as a sampler2D.
//This is just standard openGL stuff, but putting here for reference as a reminder on its 'consumption'
// NOTE(review): assumes the vertex attribute arrays were enabled elsewhere
// (glEnableVertexAttribArray) and that the shader's sampler uniform resolves
// to the texture unit this bind targets -- confirm against the setup code.
glUseProgram( ShaderHandle );
glBindTexture(GL_TEXTURE_2D, TextureHandle);
//send interleaved vertices of Sprite, Mesh, vertex attributes for me are Position and UV
glVertexAttribPointer(vertAttribID, 3, GL_FLOAT, GL_FALSE, vertFormat->StrideSize(), (void*)(vertFormat->PositionOffset()));
glVertexAttribPointer(uvID, 2, GL_FLOAT, GL_FALSE, vertFormat->StrideSize(), (void*)vertFormat->UvOffset());
glDrawArrays(DrawMode->TriangleMode, 0, vertexCount);

//And then in the shader you can read from this texture :
#version 330

// Fragment shader: samples the CUDA-written texture at the interpolated UV.
uniform sampler2D Texture;

in vec2 VertexUV;

out vec4 FinalColor;

// Fixed vs. the forum paste: main's body braces were stripped; restored them.
void main(void)
{
	FinalColor = texture(Texture, VertexUV);
}