Buffer performance warning GL_PIXEL_UNPACK_BUFFER_ARB when uploading to depth texture

My application is trying to directly set an OpenGL depth texture based on the results of a CUDA kernel. The kernel renders into a buffer object (bound as GL_PIXEL_UNPACK_BUFFER) that is mapped into CUDA with cudaGraphicsMapResources. When I try to upload from that buffer to the depth texture, I trigger the following warning.

Buffer performance warning: Buffer object 1 (bound to GL_PIXEL_UNPACK_BUFFER_ARB, usage hint is GL_STREAM_DRAW) is being copied/moved from VIDEO memory to HOST memory.

Here is a simplified outline of my application, where the CUDA rendering step has been replaced with a simple cudaMemset. The function init_gl() should be called once the OpenGL context has been initialized, and then work() is called in a loop.

const int width = 256;   // width of the depth texture and PBO, in pixels
const int height = 256;  // height of the depth texture and PBO, in pixels

// CUDA-side handle for the registered PBO; set once in init_gl().
cudaGraphicsResource_t graphics_resource = 0;
// GL_DEPTH_COMPONENT32F texture that the PBO contents are uploaded into.
GLuint depth_texture_id = 0;
// Pixel unpack buffer that CUDA writes depth values into.
GLuint pbo_id = 0;

// One-time setup: creates the depth texture and the PBO, and registers the
// PBO with CUDA for write access. Must be called after the OpenGL context
// (and CUDA device) have been initialized, and before the first work() call.
void init_gl() {
    // initialize depth texture
    glGenTextures(1, &depth_texture_id);
    glBindTexture(GL_TEXTURE_2D, depth_texture_id);
    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
    // Single mip level only; sampling never reads beyond level 0.
    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAX_LEVEL, 0);
    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_BORDER);
    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_BORDER);
    // Allocate storage only (data == nullptr); contents come from the PBO
    // upload in work(). Internal format matches the float data CUDA writes.
    glTexImage2D(GL_TEXTURE_2D,
                 /*level=*/0,
                 /*internal_format=*/GL_DEPTH_COMPONENT32F, width, height,
                 /*border=*/0,
                 /*format=*/GL_DEPTH_COMPONENT,
                 /*type=*/GL_FLOAT,
                 /*data=*/nullptr);
    glBindTexture(GL_TEXTURE_2D, 0);

    // initialize PBO
    glGenBuffers(1, &pbo_id);
    glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo_id);
    // One float per pixel; GL_STREAM_DRAW hints that the buffer is
    // rewritten (by CUDA) every frame and consumed once per upload.
    glBufferData(GL_PIXEL_UNPACK_BUFFER, width * height * sizeof(float),
                 nullptr, GL_STREAM_DRAW);
    glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);

    // register the PBO with cuda
    // WriteDiscard tells CUDA the previous contents need not be preserved
    // across map/unmap, since every frame fully overwrites the buffer.
    CUDACHECK(cudaGraphicsGLRegisterBuffer(
        &graphics_resource, pbo_id, cudaGraphicsRegisterFlagsWriteDiscard));
}

void work() {
    // map the pbo
    CUDACHECK(cudaGraphicsMapResources(1, &graphics_resource));
    size_t bufsize = 0;
    void* device_ptr = nullptr;
    CUDACHECK(cudaGraphicsResourceGetMappedPointer(&device_ptr, &bufsize,
                                                   graphics_resource));

    // write to the pbo
    cudaMemset(device_ptr, 0, bufsize);

    // unmap
    CUDACHECK(cudaGraphicsUnmapResources(1, &graphics_resource));
    device_ptr = nullptr;

    // upload pbo to the texture
    glPixelStorei(GL_PACK_ALIGNMENT, 1);
    glBindTexture(GL_TEXTURE_2D, depth_texture_id);
    glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo_id);
    glTexSubImage2D(GL_TEXTURE_2D,
                    /*level=*/0,
                    /*xoffset=*/0,
                    /*yoffset=*/0, width, height,
                    /*format=*/GL_DEPTH_COMPONENT,
                    /*type=*/GL_FLOAT,
                    /*data=*/nullptr);
    glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
    glBindTexture(GL_TEXTURE_2D, 0);
}

I would like to avoid copying the pixel buffer to host memory, and it’s unclear why the driver is doing this copy. This issue does not seem to occur for color buffers (GL_RGB).

Just popping in to share a work-around which seems to speed up the rendering. Instead of uploading to a GL_DEPTH_COMPONENT texture, upload the PBO to a GL_RED texture. Then write a simple fragment shader that sets gl_FragDepth directly, and render both the color and depth textures onto a full-screen quad.

Here is the fragment shader I used.

#version 430
out vec4 output_color;
in vec2 texcoord;

uniform sampler2D input_color;
uniform sampler2D input_depth;
uniform bool flip_y = false;
uniform bool flip_x = false;

// The metric depth read from the texture must be converted to a normalized
// depth inside the view frustum. These two values are the (2,2) and (2,3)
// entries (zero-indexed) of a standard 4x4 projection matrix, e.g.
//
// fx,  0,          cx, 0
//  0, fy,          cy, 0
//  0,  0, depth_scale, depth_offset
//  0,  0,           1, 0

uniform float depth_scale = 1.0002;
uniform float depth_offset = -0.020002;

void main()
{
    // Optionally mirror the sampling coordinate on either axis.
    vec2 uv = texcoord;
    if (flip_y) uv.y = 1.0 - uv.y;
    if (flip_x) uv.x = 1.0 - uv.x;

    // Metric depth lives in the red channel of the depth texture.
    float metric_depth = texture(input_depth, uv).r;

    // Projection + perspective divide:
    //   clip.z / clip.w = (depth_scale*z + depth_offset) / z
    float ndc_depth = (metric_depth * depth_scale + depth_offset) / metric_depth;

    // ndc_depth is in NDC [-1, 1]; window-space depth wants [0, 1]
    // (assuming the default depth range of [0, 1]).
    gl_FragDepth = (ndc_depth + 1.0) / 2.0;

    output_color = texture(input_color, uv);
}