Passing source pointer from OpenGL texture to cuda kernel?

Greetings All,

I have a question regarding performance when interop between cuda and opengl is involved, and secondarily I was hoping to get a clarification on how to read/write to an opengl texture from a cuda thread.

What I have working currently is a standard case: a GPU byte array which is mapped in CUDA and is copied to an OpenGL texture via cudaMemcpyToArray. The steps I use for this are:

//opengl portion
// Create a 2D texture and allocate its storage. The NULL data pointer means
// no pixels are uploaded here -- storage is allocated, contents undefined.
glGenTextures(1, &TextureHandle);
glBindTexture(GL_TEXTURE_2D, TextureHandle);
glTexImage2D(GL_TEXTURE_2D, 0, Settings.StorageFormat, Size.Width, Size.Height, 0, Settings.UsageFormat, Settings.DataType, NULL);

//Cuda portion
// Register the GL texture with CUDA, map it, then copy an existing device
// buffer into the texture's backing cudaArray (device-to-device copy).
// NOTE(review): cudaMemcpyToArray is deprecated in recent CUDA toolkits;
// cudaMemcpy2DToArray is the documented replacement -- confirm toolkit version.
// NOTE(review): none of these calls check the returned cudaError_t; a failed
// register/map makes every later call fail mysteriously.
cudaGraphicsGLRegisterImage(&CudaRes, TextureHandle, GL_TEXTURE_2D, cudaGraphicsMapFlagsNone);
cudaArray *texture_ptr;
cudaGraphicsMapResources(1, &CudaRes);
cudaGraphicsSubResourceGetMappedArray(&texture_ptr, CudaRes, 0, 0);
cudaMemcpyToArray( texture_ptr, 0, 0, GpuImage->data(), GpuImage->bytes(), cudaMemcpyDeviceToDevice);
cudaGraphicsUnmapResources(1, &CudaRes);

This works just fine and renders just fine. The change I am trying to make and understand is how to get access to the OpenGL texture directly without the use of a secondary buffer. My kernel is relatively straightforward and should write a constant color:

// Writes a constant color (orange, full alpha) to every pixel of a float4
// color buffer. Expects a 2D launch; INDEX_2D flattens (x, y) via the stride.
//
// d_colorBuffer:  output buffer, one core::float4 per pixel
// colorInfo:      per-pixel color input (unused in this test kernel)
// stride:         row stride passed to INDEX_2D -- assumed to be in
//                 elements, TODO confirm against INDEX_2D's definition
// numberOfPoints: total pixel count, used as the bounds guard
//
// Fixed vs. the forum paste: the parameter list was never closed, the body
// braces were stripped, and the bounds check had no statement -- restored
// the `)`, `{ ... }` and the early `return;`.
__global__ void cuTestTexture(
        core::float4* d_colorBuffer,
        /* in */
        const core::float4* colorInfo,
        const size_t stride,
        const core::uint numberOfPoints )
{
    const unsigned int IDx = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int IDy = blockIdx.y * blockDim.y + threadIdx.y;
    const unsigned int pixelID = INDEX_2D(IDx, IDy, stride);

    // Grids rarely divide the image evenly; out-of-range threads must exit
    // before touching memory.
    if( pixelID >= numberOfPoints )
        return;

    d_colorBuffer[pixelID] = core::float4(1, 0.5f, 0, 1);
}

I have tried calling cudaGraphicsSubResourceGetMappedArray, which gives me a cudaArray*, but I am not sure how to pass this to my kernel as a float4*. Trying to get a pointer to the first byte of the texture via cudaGraphicsResourceGetMappedPointer instead returns cudaErrorUnknown.

This presentation:

starting at around slide 22, demonstrates how to have a CUDA kernel write data that shows up in an OpenGL texture, without any extra buffer copying.

Appreciate the link txbob!

I noticed in the slides, they create a pixel buffer object and generate a texture:

Slide 23:glBufferData(GL_PIXEL_UNPACK_BUFFER, Width * Height * 4,
Slide 24: glTexImage2D( GL_TEXTURE_2D, 0, GL_RGBA8, Width, Height, 0, GL_BGRA,

Wouldn’t this actually allocate twice the memory anyway? And thus, the Pixel Buffer Object just becomes a GPU-owned byte array in the end?

I didn’t end up using a pixel buffer object though as the GPU Tech Conference describes, so below are my findings as it may help someone get up and running by using the cudaSurfaceObject compute features. For two textures of 640x480, I was getting ~1800+ frames per second on a GTX 780.

//Creating the OpenGL texture:
glGenTextures(1, &TextureHandle);
glBindTexture(GL_TEXTURE_2D, TextureHandle);
// Filtering and wrap state come from the caller-supplied Settings struct.
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, Settings.MinFilter);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, Settings.MagFilter);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, Settings.WrapModeS);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, Settings.WrapModeT);

// At this point you can do several things
// Upload a cpu based image:
//glTexImage2D(GL_TEXTURE_2D, 0, Settings.StorageFormat, Size.Width, Size.Height, 0, Settings.UsageFormat, Settings.DataType, image->RawPixels());
// Upload a gpu based image:
// We pass NULL because there is no CPU bound buffer, this will do the storage copy since it is on the GPU, and openGL has no API which can take in a GPU device ptr to memory, that part is handled by cuda API calls
//glTexImage2D(GL_TEXTURE_2D, 0, Settings.StorageFormat, GpuImage->width(), GpuImage->height(), 0, Settings.UsageFormat, Settings.DataType, NULL); //0 for 0 border

//What we want really is to create the memory only once: allocate storage
//with no initial pixel data (NULL) -- CUDA will write the contents later.
glTexImage2D(GL_TEXTURE_2D, 0, Settings.StorageFormat, Size.Width, Size.Height, 0, Settings.UsageFormat, Settings.DataType, NULL);

//Register texture for usage in Cuda. The SurfaceLoadStore flag is important:
//we plan to map this image to a cudaSurfaceObject_t for writing, and
//registering without it makes surface writes invalid.
// NOTE(review): return value unchecked -- registration fails for texture
// formats CUDA cannot map; confirm Settings.StorageFormat is a supported one.
cudaGraphicsGLRegisterImage(&CudaRes, TextureHandle, GL_TEXTURE_2D, cudaGraphicsRegisterFlagsSurfaceLoadStore);

//At some later time we update pixels via cuda:
//At some later time we update pixels via cuda:
glBindTexture(GL_TEXTURE_2D, TextureHandle);
cudaArray* texture_ptr;
cudaGraphicsMapResources(1, &CudaRes, 0);
cudaGraphicsSubResourceGetMappedArray(&texture_ptr, CudaRes, 0, 0);

//Pass texture_ptr to a launch kernel, global c function
//(fixed: the mapped array declared above is texture_ptr; the original paste
// passed a nonexistent variable named textureData)
krnlTestTexture( grid, block, &texture_ptr, ... );

//Inside the launch wrapper (krnlTestTexture), build a resource description
//and a surface object for writing; openGlTexture is the cudaArray**
//parameter received from the caller.
struct cudaResourceDesc description;
memset(&description, 0, sizeof(description));
description.resType = cudaResourceTypeArray;
description.res.array.array = *openGlTexture;

cudaSurfaceObject_t target;
cudaError_t error = cudaCreateSurfaceObject(&target, &description);
// NOTE(review): check 'error' before launching -- a failed creation leaves
// 'target' unusable and the launch will fault.

//now we can launch kernel with the cudaSurfaceObject created from the cudaArray
cuTestTexture<<<grid,dim>>>( target, ... );

//Fixed: destroy the surface object after the launch. It references the
//mapped array, which becomes invalid at unmap time; recreating one every
//frame without destroying it leaks the handle.
cudaDestroySurfaceObject(target);

//Texture kernel. Knowledge of the texture format can be passed in, and this is
//going to be important: surf2Dwrite's x parameter is a BYTE offset, so you need
//intimate knowledge of how the texture memory is laid out. For my case I am
//using GL_RGBA, and I want to write a 'pixel' as four separate byte channels.
//sizeof(uchar4) == sizeof(float) == 4, so either works as the texel byte size.

// Writes a test gradient into an RGBA8 texture through a surface object.
// surf2Dwrite takes a BYTE offset in x, so the column index is scaled by
// the texel size (uchar4 == 4 bytes for GL_RGBA8).
//
// target:         surface object bound to the mapped GL texture's cudaArray
// colorInfo:      per-pixel color input (unused in this test kernel)
// stride:         row width -- assumed to be in pixels, TODO confirm
// numberOfPoints: total pixel count
// frameCount:     frame counter (unused in this test kernel)
//
// Fixed vs. the forum paste: restored the stripped `)` and `{ ... }`, added
// the missing bounds guard (the grid is rounded up, so unguarded threads
// performed out-of-bounds surface writes), and scaled x by sizeof(uchar4)
// instead of the coincidentally-equal sizeof(float).
__global__ void cuTestTexture(
        cudaSurfaceObject_t target,
        /* in */
        const core::float4* colorInfo,
        const size_t stride,
        const core::uint numberOfPoints,
        const int frameCount )
{
    const unsigned int IDx = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int IDy = blockIdx.y * blockDim.y + threadIdx.y;

    // Clamp to the image extent in both dimensions before writing.
    if( IDx >= stride || INDEX_2D(IDx, IDy, stride) >= numberOfPoints )
        return;

    uchar4 data = make_uchar4(IDx%255, IDy%255, 0x00, 0xff);
    // x is a byte offset: one uchar4 (4 bytes) per texel.
    surf2Dwrite(data, target, IDx * sizeof(uchar4), IDy);
}

//Unmap cuda
//Unbind openGL

//Example usage:
        // Fetch (or lazily create) the shared test texture, then dispatch the
        // CUDA kernel that fills it each frame.
        ui::GPUTexture* texture = (ui::GPUTexture*)ui::TextureManager::GetInstance()->GetTexture("TestTexture");
        if( texture == NULL )
        {
            // First use: create the texture sized to the color map and hand
            // ownership to the manager.
            ui::TextureSettings textureSettings( GL_LINEAR, GL_LINEAR, GL_RGBA, GL_RGBA, GL_FLOAT, false, false);
            geometry::Size textureSize = geometry::Size( d_colorMap->width(), d_colorMap->height());
            ui::GPUTexture* mutableTexture = new ui::GPUTexture( textureSize, textureSettings);
            ui::TextureManager::GetInstance()->AddTexture("TestTexture", mutableTexture);
            mutableTexture->Release(); //give ownership to manager
            // Fixed: 'texture' was still NULL past this point; the original
            // snippet then called BindForCuda() through the NULL pointer.
            texture = mutableTexture;
        }

        cudaArray* textureData = texture->BindForCuda();

        dim3 grid;
        dim3 block;
        cuda::CudaDimsFromImage(grid, block, d_colorMap->width(), d_colorMap->height());
        int count = d_colorMap->width() * d_colorMap->height();
        krnlTestTexture( grid, block, &textureData, (const core::float4*)d_colorMap->data(), d_colorMap->stride(), count, FrameCount);



//Rendering in openGL, I use shaders, so ultimately I want this texture to be passed as a sampler2D.
//This is just standard openGL stuff, but putting here for reference as a reminder on its 'consumption'
// NOTE(review): assumes the vertex attribute arrays were enabled elsewhere
// (glEnableVertexAttribArray) and that the shader's sampler uniform resolves
// to the texture unit this bind targets -- confirm against the setup code.
glUseProgram( ShaderHandle );
glBindTexture(GL_TEXTURE_2D, TextureHandle);
//send interleaved vertices of Sprite, Mesh, vertex attributes for me are Position and UV
glVertexAttribPointer(vertAttribID, 3, GL_FLOAT, GL_FALSE, vertFormat->StrideSize(), (void*)(vertFormat->PositionOffset()));
glVertexAttribPointer(uvID, 2, GL_FLOAT, GL_FALSE, vertFormat->StrideSize(), (void*)vertFormat->UvOffset());
glDrawArrays(DrawMode->TriangleMode, 0, vertexCount);

//And then in the shader you can read from this texture :
#version 330

// Fragment shader: samples the CUDA-written texture at the interpolated UV.
uniform sampler2D Texture;

in vec2 VertexUV;

out vec4 FinalColor;

// Fixed vs. the forum paste: main's body braces were stripped; restored them.
void main(void)
{
	FinalColor = texture(Texture, VertexUV);
}