CUDA/GL Interop and texture copies, BGRA getting flipped to RGBA?

Short Version

I was wondering if anybody knows whether the CUDA driver does anything explicit to modify OpenGL state with respect to textures. I’ve set the code up so that it either performs device -> device transfers via CUDA, or, in the non-CUDA path, simply uploads with OpenGL’s glTexImage2D.

I had set everything up to use BGRA, but the CUDA version somehow comes out wrong, whereas the CPU version (since glTexImage2D re-specifies the storage each upload) works out just fine. If I turn all the knobs to make everything RGBA (including reordering the actual texture source storage), both paths work exactly as expected.

Long Version

There’s a lot more going on here, and it’s quite possible there is a bug somewhere else. But I can’t help feeling that the CUDA graphics resource is somehow changing the state associated with the texture. Sorry if this is too much information:

// helper struct that gets passed around
// Helper struct that gets passed around: one render "feed" pairing a GL
// texture with (optionally) its CUDA interop handles.
struct DataFeed {
    // Texture unit used at draw time. NOTE(review): the default 0 is not a
    // valid glActiveTexture argument (units are GL_TEXTURE0 + i) — confirm
    // this is always assigned a real unit before use.
    GLenum mTexUnit = 0;
    // GL texture object that both paths (CUDA copy / glTexImage2D) write into.
    GLuint mTex = 0;

    #if defined(USE_CUDA)
        // CUDA view of the texture's level-0 storage. NOTE(review): per the
        // CUDA docs this pointer is only guaranteed valid while the graphics
        // resource below is mapped — caching it across an unmap is suspect.
        cudaArray *mCudaTexArray = nullptr;
        // Interop handle from cudaGraphicsGLRegisterImage.
        cudaGraphicsResource_t mCudaTexGraphicsResource = nullptr;
        // Stream used for map/unmap and the per-frame async copy.
        cudaStream_t mStream = nullptr;
    #endif
};

Upon creation of the viewing program, everything is set up and the textures are allocated once. As far as I understand it, this first glTexImage2D is what should be defining BGRA for both OpenGL and CUDA.

// NOTE(review): with mTexUnit defaulting to 0 this passes an invalid enum —
// glActiveTexture takes GL_TEXTURE0 + i. It is also unnecessary during
// creation: the calls below operate on whatever glBindTexture binds.
glActiveTexture(feed.mTexUnit);// i don't think this is necessary here 
glGenTextures(1, &feed.mTex);
glBindTexture(GL_TEXTURE_2D, feed.mTex);

// Nearest filtering and clamped sampling — no mipmaps are allocated or used.
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);

// Zero out the texture / allocate level-0 storage. NOTE(review): only
// internal_format (GL_RGBA) describes how texels are *stored*; GL_BGRA is
// just the client-side layout of this one (null) upload and is not
// remembered by the texture. A later raw CUDA copy therefore writes into
// RGBA-ordered storage with no swizzle applied.
glTexImage2D(
    GL_TEXTURE_2D, 0, Params::fmt::internal_format, // internal_format := GL_RGBA
    Params::width, Params::height, 0,
    Params::fmt::format, Params::fmt::gl_type, nullptr // format  := GL_BGRA,
                                                       // gl_type := GL_UNSIGNED_BYTE
);

#if defined(USE_CUDA)
    // Now that the texture has been created, register it with CUDA so its
    // storage can be written from device code.
    checkCudaExcept(cudaGraphicsGLRegisterImage(
        &feed.mCudaTexGraphicsResource,
        feed.mTex,
        GL_TEXTURE_2D,
        // we perform device -> device copies over the texture, where the
        // texture memory is sent to CUDA first (memcpy elsewhere), so when this
        // resource is used it's OK to obliterate
        cudaGraphicsRegisterFlagsWriteDiscard
    ));
    // Map the resource so the backing array can be queried
    checkCudaExcept(cudaGraphicsMapResources(
        1, &feed.mCudaTexGraphicsResource, feed.mStream
    ));
    // Fetch the cudaArray backing mip level 0 of the texture
    checkCudaExcept(cudaGraphicsSubResourceGetMappedArray(
        &feed.mCudaTexArray,
        feed.mCudaTexGraphicsResource,
        0,// array index
        0 // mip map level
    ));
    // Unmap the resource. NOTE(review): per the CUDA runtime docs the array
    // obtained above is only guaranteed valid while the resource is mapped;
    // writing through the cached mCudaTexArray after this unmap (as the draw
    // path does) is undefined behavior — the map / get-array / unmap sequence
    // should instead bracket each frame's copy.
    checkCudaExcept(cudaGraphicsUnmapResources(
        1, &feed.mCudaTexGraphicsResource, feed.mStream
    ));
#endif // USE_CUDA

So during the actual application drawing, things look like this

// Bind the shared shader and this feed's texture for the draw.
mSharedFeedShader.bind();

glActiveTexture(curr_feed->mTexUnit);

// setup some shader uniforms etc

glBindTexture(GL_TEXTURE_2D, curr_feed->mTex);

// if curr_data is not nullptr, then a new frame needs to be copied
if(curr_data) {
    // if we are using CUDA, the data feed is already on the device.  use the PBOs
    // to perform device -> device memory transfer from CUDA land to textures that
    // OpenGL can use to render
    #if defined(USE_CUDA)
        // NOTE(review): three issues here —
        // 1. The graphics resource was unmapped right after init, and the
        //    CUDA docs only guarantee mCudaTexArray while the resource is
        //    mapped; cudaGraphicsMapResources / cudaGraphicsUnmapResources
        //    should bracket this copy every frame.
        // 2. cudaMemcpyToArrayAsync is deprecated (CUDA 10.1+);
        //    cudaMemcpy2DToArrayAsync is the supported replacement.
        // 3. This is a raw byte copy with NO channel conversion: the bytes
        //    land in the texture's GL_RGBA-ordered storage as-is. With BGRA
        //    source data that swaps red and blue — unlike the glTexImage2D
        //    path below, where GL converts GL_BGRA client data during the
        //    upload. This is the likely cause of the observed channel flip.
        checkCudaExcept(cudaMemcpyToArrayAsync(
            curr_feed->mCudaTexArray, // destination
            0, 0,                     // x, y offsets
            curr_data,                // source
            Params::numBytes,         // size in bytes
            cudaMemcpyDeviceToDevice, // transfer kind
            curr_feed->mStream        // the stream this data is part of
        ));
    #else
        // for non-cuda, just copy the data up to the GPU using OpenGL;
        // here glTexImage2D performs the GL_BGRA -> GL_RGBA conversion itself.
        glTexImage2D(
            GL_TEXTURE_2D, 0, Params::fmt::internal_format,
            Params::width, Params::height, 0, Params::fmt::format,
            Params::fmt::gl_type, curr_data
        );
    #endif // USE_CUDA
}

// either the data is the same (no copy), or the data was just copied.  now draw
mSharedFeedShader.drawIndexed(GL_TRIANGLES, 0, 2);

What am I missing here? Thanks for any insight!

Did you ever solve this? I’m stuck at more or less the same spot…