Help needed in creation of a memory buffer to store images

Hi,

I need some help in trying to create a memory buffer that can store two images.
I am not sure how to write a program which is correct because everything I write leads to an infinite loop or creates another problem.

The following is the code which I adapted from the CUDA/OpenGL interop lesson in ‘CUDA-By Example’. Any help in writing the program is REALLY appreciated.

#include “book.h”
#include “cpu_bitmap.h”

#include “cuda.h”
#include <cuda_gl_interop.h>

PFNGLBINDBUFFERARBPROC glBindBuffer = NULL;
PFNGLDELETEBUFFERSARBPROC glDeleteBuffers = NULL;
PFNGLGENBUFFERSARBPROC glGenBuffers = NULL;
PFNGLBUFFERDATAARBPROC glBufferData = NULL;

#define DIM 512

#define IMAGESIZE_MAX (DIM*DIM)

GLuint bufferObj;
cudaGraphicsResource *resource;

// based on ripple code, but uses uchar4 which is the type of data
// graphic inter op uses. see screenshot - basic2.png

global void kernel( uchar4 *ptr1) {
// map from threadIdx/BlockIdx to pixel position
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
int offset = x + y * blockDim.x * gridDim.x ;

// now calculate the value at that position
float fx = x/(float)DIM - 0.5f;
float fy = y/(float)DIM - 0.5f;
unsigned char   green = 128 + 127 * tan( abs(fx*100) - abs(fy*100) );

// accessing uchar4 vs unsigned char*
ptr1[offset].x = 0;
ptr1[offset].y = green;
ptr1[offset].z = 0;
ptr1[offset].w = 255;

}

global void kernel2( uchar4 *ptr2) {
// map from threadIdx/BlockIdx to pixel position
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
int offset = x + y * blockDim.x * gridDim.x ;

// now calculate the value at that position
float fx = x/(float)DIM - 0.5f;
float fy = y/(float)DIM - 0.5f;
unsigned char   green = 128 + 127 * tan( abs(fx*100) - abs(fy*100) );

// accessing uchar4 vs unsigned char*
ptr2[offset].x = 1000;
ptr2[offset].y = green;
ptr2[offset].z = 0;
ptr2[offset].w = 255;

}

void key_func( unsigned char key, int x, int y ) {
switch (key) {
case 27:
// clean up OpenGL and CUDA
( cudaGraphicsUnregisterResource( resource ) );
glBindBuffer( GL_PIXEL_UNPACK_BUFFER_ARB, 0 );
glDeleteBuffers( 1, &bufferObj );
exit(0);
}
}

void draw_func( void ) {
// we pass zero as the last parameter, because out bufferObj is now
// the source, and the field switches from being a pointer to a
// bitmap to now mean an offset into a bitmap object
glDrawPixels( DIM, DIM, GL_RGBA, GL_UNSIGNED_BYTE, 0 );
glutSwapBuffers();
}

int main( int argc, char **argv ) {
cudaDeviceProp prop;
int dev;

memset( &prop, 0, sizeof( cudaDeviceProp ) );
prop.major = 1;
prop.minor = 0;
( cudaChooseDevice( &dev, &prop ) );

// tell CUDA which dev we will be using for graphic interop
// from the programming guide:  Interoperability with OpenGL
//     requires that the CUDA device be specified by
//     cudaGLSetGLDevice() before any other runtime calls.

( cudaGLSetGLDevice( dev ) );

// these GLUT calls need to be made before the other OpenGL
// calls, else we get a seg fault
glutInit( &argc, argv );
glutInitDisplayMode( GLUT_DOUBLE | GLUT_RGBA );
glutInitWindowSize( DIM, DIM );
glutCreateWindow( "bitmap" );

glBindBuffer    = (PFNGLBINDBUFFERARBPROC)GET_PROC_ADDRESS("glBindBuffer");
glDeleteBuffers = (PFNGLDELETEBUFFERSARBPROC)GET_PROC_ADDRESS("glDeleteBuffers");
glGenBuffers    = (PFNGLGENBUFFERSARBPROC)GET_PROC_ADDRESS("glGenBuffers");
glBufferData    = (PFNGLBUFFERDATAARBPROC)GET_PROC_ADDRESS("glBufferData");

// the first three are standard OpenGL, the 4th is the CUDA reg 
// of the bitmap these calls exist starting in OpenGL 1.5
glGenBuffers( 1, &bufferObj );
glBindBuffer( GL_PIXEL_UNPACK_BUFFER_ARB, bufferObj );
glBufferData( GL_PIXEL_UNPACK_BUFFER_ARB, DIM * DIM * 4 ,
              NULL, GL_DYNAMIC_DRAW_ARB );

// REGISTER THE GL BufferObj and CUDA Resource

( cudaGraphicsGLRegisterBuffer( &resource, 
                                  bufferObj, 
                                  cudaGraphicsMapFlagsNone ) );

// do work with the memory dst being on the GPU, gotten via mapping
HANDLE_ERROR( cudaGraphicsMapResources( 1, &resource, NULL ) );


uchar4* devPtr;
size_t  size ;

cudaMalloc ( (uchar4 **)&devPtr,  size); 

uchar4 *devPtr2; 

cudaMalloc ( (uchar4 **)&devPtr2,  size); 

uchar4 *pBuffer;

(cudaMalloc ( (uchar4 **)&pBuffer,  size));

uchar4 *pBufferCurrent;

(cudaMalloc ( (uchar4 **)&pBuffer,  size));


uchar4 *pBufferImage;
(cudaMalloc ( (uchar4 **)&pBufferImage,  size));


HANDLE_ERROR( 
    cudaGraphicsResourceGetMappedPointer( (void**)&pBufferImage, 
                                          &size, 
                                          resource) );

dim3    grids(DIM/16,DIM/16);
dim3    threads(16,16);
kernel<<<grids,threads>>>( devPtr );

// kernel2<<<grids,threads>>>(devPtr2);
int a = 1;
do
{

cudaMemcpy ( pBufferImage, devPtr, size, cudaMemcpyDeviceToDevice);

a++;

} while (a==1); 



   // REGISTER THE C BUFFER and CUDA Resource

HANDLE_ERROR ( cudaGraphicsUnmapResources( 1, &resource, NULL ) );

// set up GLUT and kick off main loop
glutKeyboardFunc( key_func );
glutDisplayFunc( draw_func );
glutMainLoop();

}