Appreciate the link txbob!
I noticed in the slides, they create a pixel buffer object and generate a texture:
Slide 23: glBufferData(GL_PIXEL_UNPACK_BUFFER, Width * Height * 4,
NULL, GL_DYNAMIC_COPY);
Slide 24: glTexImage2D( GL_TEXTURE_2D, 0, GL_RGBA8, Width, Height, 0, GL_BGRA,
GL_UNSIGNED_BYTE, NULL)
Wouldn’t this actually allocate the memory twice anyway? And thus, the Pixel Buffer Object just becomes a GPU-owned byte array in the end?
I didn’t end up using a pixel buffer object though as the GPU Tech Conference describes, so below are my findings as it may help someone get up and running by using the cudaSurfaceObject compute features. For two textures of 640x480, I was getting ~1800+ frames per second on a GTX 780.
//Creating the OpenGL texture:
glGenTextures(1, &TextureHandle);
glBindTexture(GL_TEXTURE_2D, TextureHandle);
//Sampler state comes from a project-level Settings struct (filters + wrap modes).
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, Settings.MinFilter);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, Settings.MagFilter);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, Settings.WrapModeS);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, Settings.WrapModeT);
//etc... (any remaining sampler state)
// At this point you can do several things:
// Upload a cpu based image (last argument is a host-side pixel pointer):
//glTexImage2D(GL_TEXTURE_2D, 0, Settings.StorageFormat, Size.Width, Size.Height, 0, Settings.UsageFormat, Settings.DataType, image->RawPixels());
// Upload a gpu based image:
// We pass NULL because there is no CPU-side buffer to copy from; OpenGL has no API that
// accepts a raw GPU device pointer, so the actual pixel transfer is done later via the
// CUDA interop calls below.
//glTexImage2D(GL_TEXTURE_2D, 0, Settings.StorageFormat, GpuImage->width(), GpuImage->height(), 0, Settings.UsageFormat, Settings.DataType, NULL); //0 for 0 border
//What we really want is to allocate the texture storage exactly once
//(data = NULL means "allocate storage, upload nothing"):
glTexImage2D(GL_TEXTURE_2D, 0, Settings.StorageFormat, Size.Width, Size.Height, 0, Settings.UsageFormat, Settings.DataType, NULL);
//Register the texture for use in CUDA. The SurfaceLoadStore flag is important:
//we plan to map this image to a cudaSurfaceObject_t and write to it from a kernel.
//NOTE(review): this returns a cudaError_t — check it in production code.
cudaGraphicsGLRegisterImage(&CudaRes, TextureHandle, GL_TEXTURE_2D, cudaGraphicsRegisterFlagsSurfaceLoadStore);
//At some later time we update pixels via cuda:
glBindTexture(GL_TEXTURE_2D, TextureHandle);
cudaArray* texture_ptr;
//Map the registered GL image for CUDA access (stream 0 here).
//NOTE(review): both interop calls return cudaError_t — check them in production code.
cudaGraphicsMapResources(1, &CudaRes, 0);
//Fetch the cudaArray backing mip level 0 / array index 0 of the mapped texture.
cudaGraphicsSubResourceGetMappedArray(&texture_ptr, CudaRes, 0, 0);
//Pass texture_ptr to the host-side launch wrapper (a plain C++ function that builds
//the surface object and launches the __global__ kernel).
//Fixed: the original passed &textureData, a name that does not exist in this snippet;
//the mapped array obtained above is texture_ptr.
krnlTestTexture( grid, block, &texture_ptr, ... );
//Inside kernel launch, create description and surface object for writing:
struct cudaResourceDesc description;
memset(&description, 0, sizeof(description));
description.resType = cudaResourceTypeArray;
description.res.array.array = *openGlTexture;
cudaSurfaceObject_t target;
cudaError_t error = cudaCreateSurfaceObject(&target, &description);
//now we can launch kernel with the cudaSurfaceObject created from the cudaArray
cuTestTexture<<<grid,dim>>>( target, ... );
//Texture kernel. Knowledge of the texture format can be passed in, and this is going to be
//important: surf2Dwrite's x parameter is a BYTE offset, so you need intimate knowledge of how
//the texture memory is laid out. In my case I am using GL_RGBA, and I want to write a 'pixel'
//as four separate 1-byte channels; sizeof(uchar4) == sizeof(float) == 4, so either size
//expression yields the same byte offset per texel.
//Fills the bound surface with a simple test gradient.
//Expects a 2D launch whose grid covers (width, height) of the texture. Threads that
//land outside the texture are rendered harmless by cudaBoundaryModeZero (out-of-range
//surface writes are ignored), so the grid may safely over-cover a texture whose
//dimensions are not a multiple of the block size — the default trap mode would
//otherwise fault the kernel on those threads.
//colorInfo/stride/numberOfPoints/frameCount are currently unused by this test pattern;
//they are kept so the signature matches the host-side wrapper's call.
__global__ void cuTestTexture(
/*out*/
cudaSurfaceObject_t target,
/* in */
const core::float4* colorInfo,
const size_t stride,
const core::uint numberOfPoints,
const int frameCount
)
{
    const unsigned int IDx = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int IDy = blockIdx.y * blockDim.y + threadIdx.y;
    //Gradient test pattern; note x%255 wraps one step early (255 -> 0), which only
    //affects the look of the pattern, not correctness.
    uchar4 data = make_uchar4(IDx%255, IDy%255, 0x00, 0xff);
    //x is a byte offset: one texel is a uchar4 (4 bytes — same size as a float, but
    //sizeof(uchar4) states the intent for an RGBA8 texture).
    surf2Dwrite(data, target, IDx * sizeof(uchar4), IDy, cudaBoundaryModeZero);
}
//Unmap cuda
//Unbind openGL
//Example usage:
//Look up the shared test texture; created lazily on the first frame.
ui::GPUTexture* texture = (ui::GPUTexture*)ui::TextureManager::GetInstance()->GetTexture("TestTexture");
if( texture == NULL )
{
//First frame: texture doesn't exist yet — create it and hand it to the manager.
//NOTE(review): GL_FLOAT is passed as the DataType here while the kernel writes uchar4
//texels — confirm the GPUTexture ctor reconciles this with the RGBA8-style writes.
ui::TextureSettings textureSettings( GL_LINEAR, GL_LINEAR, GL_RGBA, GL_RGBA, GL_FLOAT, false, false);
geometry::Size textureSize = geometry::Size( d_colorMap->width(), d_colorMap->height());
ui::GPUTexture* mutableTexture = new ui::GPUTexture( textureSize, textureSettings);
ui::TextureManager::GetInstance()->AddTexture("TestTexture", mutableTexture);
mutableTexture->Release(); //give ownership to manager
}
else
{
//Subsequent frames: bind, map for CUDA, run the kernel, then unmap/unbind in reverse order.
texture->Bind();
//BindForCuda maps the GL image and returns the backing cudaArray.
cudaArray* textureData = texture->BindForCuda();
dim3 grid;
dim3 block;
//Presumably derives a 2D launch config covering the image — verify against its definition.
cuda::CudaDimsFromImage(grid, block, d_colorMap->width(), d_colorMap->height());
int count = d_colorMap->width() * d_colorMap->height();
//Host-side wrapper: builds the surface object from the cudaArray and launches cuTestTexture.
krnlTestTexture( grid, block, &textureData, (const core::float4*)d_colorMap->data(), d_colorMap->stride(), count, FrameCount);
//Unmap so OpenGL may safely sample the updated texture again.
texture->UnbindForCuda();
texture->Unbind();
FrameCount++;
}
//Rendering in openGL, I use shaders, so ultimately I want this texture to be passed as a sampler2D.
//This is just standard openGL stuff, but putting here for reference as a reminder on its 'consumption'
glUseProgram( ShaderHandle );
//Binds to the currently active texture unit (unit 0 unless glActiveTexture changed it);
//the shader's sampler2D uniform defaults to unit 0 — NOTE(review): confirm no explicit
//glUniform1i is needed for your shader setup.
glBindTexture(GL_TEXTURE_2D, TextureHandle);
//send interleaved vertices of Sprite, Mesh, vertex attributes for me are Position and UV
glEnableVertexAttribArray(vertAttribID);
glVertexAttribPointer(vertAttribID, 3, GL_FLOAT, GL_FALSE, vertFormat->StrideSize(), (void*)(vertFormat->PositionOffset()));
glEnableVertexAttribArray(uvID);
glVertexAttribPointer(uvID, 2, GL_FLOAT, GL_FALSE, vertFormat->StrideSize(), (void*)vertFormat->UvOffset());
glDrawArrays(DrawMode->TriangleMode, 0, vertexCount);
//NOTE(review): consider glDisableVertexAttribArray after drawing if other draws share this state.
//And then in the shader you can read from this texture :
#version 330
//Passthrough fragment shader: sample the CUDA-written texture at the
//interpolated UV and emit that color unmodified.
uniform sampler2D Texture;
in vec2 VertexUV;
out vec4 FinalColor;
void main(void)
{
    vec4 sampled = texture(Texture, VertexUV);
    FinalColor = sampled;
}