I have been experimenting with using CUDA for doing processing combined with OpenGL for visualization. However the performance I get is significantly lower than expected.
My sample code processes and renders 64 tiles of 256x256 RGBA8 pixels each frame. The processing is just a simple copy from texture to texture/surface. I have implemented this ‘processing’ using both Cuda and an OpenGL fragment shader.
Results on my machine (win7, NVIDIA GF 650M with 340.25 drivers) are:
- CUDA-based processing: 8.0 ms/frame
- OpenGL-based processing: 2.5 ms/frame
As you can see, the difference is more than 3x.
Doing only the CUDA processing gives me about 1.5ms/frame so the problem really seems to be related to the OpenGL interop.
Any insights or suggestions on this would be very much appreciated!
The most relevant snippet of my code is:
// File-scope texture/surface references (legacy CUDA reference API — must be
// declared at translation-unit scope; NOTE: this API is deprecated in later
// CUDA toolkits in favor of texture/surface objects).
// 2D input texture of RGBA8 texels, read back as raw uchar4 elements.
texture<uchar4, cudaTextureType2D, cudaReadModeElementType> texRef;
// Runtime handle to texRef, filled by initProcessing() and used for the
// pointer-based cudaBindTexture2D / cudaUnbindTexture calls in processTile().
const textureReference * texRefPtr;
// Channel layout describing uchar4 (4 x 8-bit) texels for texture binding.
const cudaChannelFormatDesc texChannelDesc = cudaCreateChannelDesc<uchar4>();
// Output surface the kernel writes through; rebound per tile to the
// cudaArray mapped from the registered OpenGL texture.
surface<void, cudaSurfaceType2D> surfRef;
// One-time CUDA setup: selects the device and configures the input texture
// reference for unfiltered, unnormalized (integer-coordinate) fetches.
// On any CUDA error the function returns early; the sticky error remains
// queryable by the caller via cudaGetLastError().
void initProcessing() {
    cudaError_t cudaStatus;
    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        return;  // no usable device; later calls would fail anyway
    }
    // Init texture reference: integer texel coordinates and nearest-neighbor
    // sampling — the kernel performs a 1:1 texel copy, so no filtering.
    texRef.normalized = 0;
    texRef.filterMode = cudaFilterModePoint;
    // Fetch the runtime handle needed by the pointer-based bind API.
    cudaStatus = cudaGetTextureReference(&texRefPtr, &texRef);
    if (cudaStatus != cudaSuccess) {
        texRefPtr = 0;  // make the failure visible instead of leaving a stale pointer
    }
}
// Allocates a width x height device buffer of RGBA8 (uchar4) texels that will
// later be bound as a 2D texture by processTile().
// Returns the device pointer, or NULL on allocation failure.
// NOTE(review): linear memory bound via cudaBindTexture2D must satisfy the
// device's texturePitchAlignment; width * sizeof(uchar4) happens to be aligned
// for the 256-wide tiles used here, but cudaMallocPitch would be robust —
// verify before using other widths.
uchar4 * createInputTexture(unsigned int width, unsigned int height) {
    uchar4 * dev_ptr = 0;
    // Widen before multiplying so large dimensions cannot overflow 32 bits.
    size_t bytes = (size_t)width * (size_t)height * sizeof(uchar4);
    cudaError_t cudaStatus = cudaMalloc((void**)&dev_ptr, bytes);
    if (cudaStatus != cudaSuccess) {
        return 0;  // caller must treat NULL as out-of-memory
    }
    return dev_ptr;
}
// Uploads width x height RGBA8 pixels from host memory into the device buffer
// created by createInputTexture(). Synchronous (pageable) copy.
// Bug fix: the byte count previously used sizeof(int) — correct only by the
// accident that int and uchar4 are both 4 bytes; use the element type.
void fillInputTexture(uchar4 * dev_ptr, unsigned int width, unsigned int height, void * pixels) {
    size_t bytes = (size_t)width * (size_t)height * sizeof(uchar4);
    cudaError_t cudaStatus = cudaMemcpy(dev_ptr, pixels, bytes, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        return;  // sticky error is left for the caller to query via cudaGetLastError()
    }
}
// Registers an existing OpenGL 2D texture with CUDA so the kernel can write
// into it through a surface reference (hence SurfaceLoadStore).
// width/height are accepted for interface symmetry; the GL texture itself
// carries the dimensions.
cudaGraphicsResource_t createGLOutputTexture(unsigned int glTexture, unsigned int width, unsigned int height) {
    cudaGraphicsResource_t resource;
    cudaError_t status;
    status = cudaGraphicsGLRegisterImage(&resource, glTexture, GL_TEXTURE_2D,
                                         cudaGraphicsRegisterFlagsSurfaceLoadStore);
    return resource;
}
// Copies one texel per thread from the bound input texture (texRef) to the
// bound output surface (surfRef). Expects a 2D launch with one thread per
// pixel; the guard below makes any grid that over-covers the tile safe
// (surf2Dwrite traps on out-of-bounds by default, so unguarded tail threads
// would fault). Previously width/height were accepted but never used.
__global__ void processTile(unsigned int width, unsigned int height)
{
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x < width && y < height) {
        uchar4 pixel = tex2D(texRef, x, y);
        // Surface writes take a byte offset in x, hence the sizeof scaling.
        surf2Dwrite(pixel, surfRef, x * sizeof(uchar4), y);
    }
}
// Host-side driver for one tile: binds the linear input buffer to texRef,
// maps the registered GL texture and binds its array to surfRef, launches the
// copy kernel, then unmaps/unbinds.
// PERF NOTE(review): calling cudaGraphicsMapResources/UnmapResources once per
// tile per frame is expensive (each pair is an interop sync point). Mapping
// all tile resources in a single call (count = numTiles) per frame is the
// usual fix for CUDA/GL interop being slower than a GL-only path.
void processTile(uchar4 * dev_input, cudaGraphicsResource_t dev_output, unsigned int width, unsigned int height)
{
    cudaError_t cudaStatus;
    // Bind the texture. Pitch is width * sizeof(uchar4): rows are tightly packed.
    size_t off = 0;
    cudaStatus = cudaBindTexture2D(&off, texRefPtr, dev_input, &texChannelDesc, width, height, width * sizeof(uchar4));
    // Map the GL texture for CUDA access and fetch its backing array.
    cudaStatus = cudaGraphicsMapResources(1, &dev_output);
    cudaArray_t dev_output_array;
    cudaStatus = cudaGraphicsSubResourceGetMappedArray(&dev_output_array, dev_output, 0, 0);
    cudaStatus = cudaBindSurfaceToArray(surfRef, dev_output_array);
    // One thread per pixel. NOTE(review): integer division truncates, so this
    // assumes width is a multiple of 16 and height a multiple of 8 (true for
    // the 256x256 tiles used here) — pixels in a partial tail block would be
    // silently skipped otherwise.
    dim3 blockSize(16, 8, 1);
    dim3 gridSize(width / blockSize.x, height / blockSize.y, 1);
    processTile<<<gridSize, blockSize>>>(width, height);
    // Kernel launches return no status; launch-configuration errors surface
    // only through cudaGetLastError().
    cudaStatus = cudaGetLastError();
    // Unmap the GL texture so OpenGL may use it again.
    cudaStatus = cudaGraphicsUnmapResources(1, &dev_output);
    // Unbind the texture.
    cudaStatus = cudaUnbindTexture(texRefPtr);
}
InteropTests.zip (21 KB)