Buffer performance warning GL_PIXEL_UNPACK_BUFFER_ARB when uploading to depth texture

My application is trying to directly set an OpenGL depth texture based on the results of a CUDA kernel. The kernel renders into a buffer object (bound as GL_PIXEL_UNPACK_BUFFER) that is mapped into CUDA with cudaGraphicsMapResources. When I try to upload from that buffer to the depth texture, I trigger the following warning.

Buffer performance warning: Buffer object 1 (bound to GL_PIXEL_UNPACK_BUFFER_ARB, usage hint is GL_STREAM_DRAW) is being copied/moved from VIDEO memory to HOST memory.

Here is a simplified outline of my application, where the CUDA rendering step has been replaced with a simple cudaMemset. The function init_gl() should be called once the OpenGL context has been initialized, and then work() is called in a loop.

const int width = 256;   // width of the depth texture and PBO, in pixels
const int height = 256;  // height of the depth texture and PBO, in pixels

// CUDA-side handle for the registered PBO; set once in init_gl().
cudaGraphicsResource_t graphics_resource = 0;
// GL_DEPTH_COMPONENT32F texture that the PBO contents are uploaded into.
GLuint depth_texture_id = 0;
// Pixel unpack buffer that CUDA writes depth values into.
GLuint pbo_id = 0;

// One-time setup: creates the depth texture and the PBO, and registers the
// PBO with CUDA for write access. Must be called after the OpenGL context
// (and CUDA device) have been initialized, and before the first work() call.
void init_gl() {
    // initialize depth texture
    glGenTextures(1, &depth_texture_id);
    glBindTexture(GL_TEXTURE_2D, depth_texture_id);
    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
    // Single mip level only; sampling never reads beyond level 0.
    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAX_LEVEL, 0);
    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_BORDER);
    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_BORDER);
    // Allocate storage only (data == nullptr); contents come from the PBO
    // upload in work(). Internal format matches the float data CUDA writes.
    glTexImage2D(GL_TEXTURE_2D,
                 /*level=*/0,
                 /*internal_format=*/GL_DEPTH_COMPONENT32F, width, height,
                 /*border=*/0,
                 /*format=*/GL_DEPTH_COMPONENT,
                 /*type=*/GL_FLOAT,
                 /*data=*/nullptr);
    glBindTexture(GL_TEXTURE_2D, 0);

    // initialize PBO
    glGenBuffers(1, &pbo_id);
    glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo_id);
    // One float per pixel; GL_STREAM_DRAW hints that the buffer is
    // rewritten (by CUDA) every frame and consumed once per upload.
    glBufferData(GL_PIXEL_UNPACK_BUFFER, width * height * sizeof(float),
                 nullptr, GL_STREAM_DRAW);
    glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);

    // register the PBO with cuda
    // WriteDiscard tells CUDA the previous contents need not be preserved
    // across map/unmap, since every frame fully overwrites the buffer.
    CUDACHECK(cudaGraphicsGLRegisterBuffer(
        &graphics_resource, pbo_id, cudaGraphicsRegisterFlagsWriteDiscard));
}

void work() {
    // map the pbo
    CUDACHECK(cudaGraphicsMapResources(1, &graphics_resource));
    size_t bufsize = 0;
    void* device_ptr = nullptr;
    CUDACHECK(cudaGraphicsResourceGetMappedPointer(&device_ptr, &bufsize,
                                                   graphics_resource));

    // write to the pbo
    cudaMemset(device_ptr, 0, bufsize);

    // unmap
    CUDACHECK(cudaGraphicsUnmapResources(1, &graphics_resource));
    device_ptr = nullptr;

    // upload pbo to the texture
    glPixelStorei(GL_PACK_ALIGNMENT, 1);
    glBindTexture(GL_TEXTURE_2D, depth_texture_id);
    glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo_id);
    glTexSubImage2D(GL_TEXTURE_2D,
                    /*level=*/0,
                    /*xoffset=*/0,
                    /*yoffset=*/0, width, height,
                    /*format=*/GL_DEPTH_COMPONENT,
                    /*type=*/GL_FLOAT,
                    /*data=*/nullptr);
    glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
    glBindTexture(GL_TEXTURE_2D, 0);
}

I would like to avoid copying the pixel buffer to host memory, and it’s unclear why the driver is doing this copy. This issue does not seem to occur for color buffers (GL_RGB).

Just popping in to share a work-around which seems to speed up the rendering. Instead of uploading to a GL_DEPTH_COMPONENT texture, upload the PBO to a GL_RED texture. Then write a simple fragment shader that sets gl_FragDepth directly, and render both the color and depth textures onto a full-screen quad.

Here is the fragment shader I used.

#version 430
out vec4 output_color;
in vec2 texcoord;

uniform sampler2D input_color;
uniform sampler2D input_depth;
uniform bool flip_y = false;
uniform bool flip_x = false;

// The metric depth read from the texture must be converted to a normalized
// depth inside the view frustum. These two values are the (2,2) and (2,3)
// entries (zero-indexed) of a standard 4x4 projection matrix, e.g.
//
// fx,  0,          cx, 0
//  0, fy,          cy, 0
//  0,  0, depth_scale, depth_offset
//  0,  0,           1, 0

uniform float depth_scale = 1.0002;
uniform float depth_offset = -0.020002;

void main()
{
    // Optionally mirror the sampling coordinate on either axis.
    vec2 uv = texcoord;
    if (flip_y) uv.y = 1.0 - uv.y;
    if (flip_x) uv.x = 1.0 - uv.x;

    // Metric depth lives in the red channel of the depth texture.
    float metric_depth = texture(input_depth, uv).r;

    // Projection + perspective divide:
    //   clip.z / clip.w = (depth_scale*z + depth_offset) / z
    float ndc_depth = (metric_depth * depth_scale + depth_offset) / metric_depth;

    // ndc_depth is in NDC [-1, 1]; window-space depth wants [0, 1]
    // (assuming the default depth range of [0, 1]).
    gl_FragDepth = (ndc_depth + 1.0) / 2.0;

    output_color = texture(input_color, uv);
}