Yet another question about CUDA + OpenGl

Hello,

I am trying to combine my code with OpenGL in order to visualize in real time the configurations I generate with CUDA. I am using some data structure which I am trying ot integrate with the example I found the book “CUDA by Example”, chapter 8. I am able to generate one image, but I am not able to do a set of plots. I can only see the last configuration. Below is my code. For testing purposes I have a bunch of “particles” which just oscillate. At every iteration a density profile is created using the host function animate, with three calls (update particle position, create the field and copy to gpu ram, and map to pixels using a cuda kernel). I am also interested if it possible to combine it with some free programs ĺike blender or something similar.

(The include files are from the book https://developer.nvidia.com/cuda-example )

//rm a.out; nvcc -lGL -lGLU  -lglut -O2 -lcufft -arch=sm_30 SPH.cu; optirun ./a.out
//rm a.out; nvcc -lGL -lGLU  -lglut  -O2 -lcufft -arch=sm_30 SPH.cu ; ./a.out 

#include <cstdio>
#include <cmath>
#include <iostream>
#include <cuda.h>
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include "cpu_bitmap.h"

#include "cuda_gl_interop.h"

# define bps 4
# define tpbrho 512
# define Ngas 256
# define Nsed 16
#define     DIM    512

PFNGLBINDBUFFERARBPROC    glBindBuffer     = NULL;
PFNGLDELETEBUFFERSARBPROC glDeleteBuffers  = NULL;
PFNGLGENBUFFERSARBPROC    glGenBuffers     = NULL;
PFNGLBUFFERDATAARBPROC    glBufferData     = NULL;

GLuint  bufferObj;
cudaGraphicsResource *resource;

typedef struct {
   double *h;
   double *d;
} doublevector;

typedef struct {
   doublevector xgas; //x-component of the position of gas particles
   doublevector ygas; //y-component of the position of gas particles
   doublevector zgas; //z-component of the position of gas particles
   doublevector rhogas; //z-component of the position of gas particles
  
} field;

typedef struct {
   double omega;
   double dx;
   double dy;
   double hgas;
   double mgas;
   double rhoavegas;
   int lx;
   int ly;
   int total_size;
} modelparams;

__host__ void fname(char *conf,const int sss)
{
    
    if(sss<10)
    {
    sprintf(conf,"psi0000%d.txt",sss);
    }
    else if(sss<100)
    {
    sprintf(conf,"psi000%d.txt",sss);
    }
    else if(sss<1000)
    {
    sprintf(conf,"psi00%d.txt",sss);
    } 
    else if(sss<10000)
    {
    sprintf(conf,"psi0%d.txt",sss);
    } 
    else if(sss<100000)
    {
    sprintf(conf,"psi%d.txt",sss);
    } 
}

__host__ void parameterinit(modelparams *parameters)
{   
    
   double pi=acos(-1.0);
   
   parameters[0].dx=0.1; 
   parameters[0].dy=0.1; 
   parameters[0].hgas=1.0; 
   parameters[0].rhoavegas=15.0; //average vañue of the gas density
   parameters[0].mgas=(parameters[0].rhoavegas*parameters[0].total_size*parameters[0].dx*parameters[0].dy)/(double)Ngas; // individual mass of a gas gas particle
    
}
__host__ void pointerinit(field *gassed,modelparams *parameters)
{  
     
   parameterinit(parameters);
   gassed[0].xgas.h=(new double[Ngas]);
   gassed[0].ygas.h=(new double[Ngas]);
   gassed[0].zgas.h=(new double[Ngas]);  
   cudaMalloc((void**)&gassed[0].xgas.d, sizeof(double)*(Ngas));
   cudaMalloc((void**)&gassed[0].ygas.d, sizeof(double)*(Ngas));
   cudaMalloc((void**)&gassed[0].zgas.d, sizeof(double)*(Ngas)); 
   
   gassed[0].rhogas.h=(new double[parameters[0].total_size]);  
   cudaMalloc((void**)&gassed[0].rhogas.d, sizeof(double)*(parameters[0].total_size)); 
}

static void key_func( unsigned char key, int x, int y ) {
    switch (key) {
        case 27:
            // clean up OpenGL and CUDA
            ( cudaGraphicsUnregisterResource( resource ) );
            glBindBuffer( GL_PIXEL_UNPACK_BUFFER_ARB, 0 );
            glDeleteBuffers( 1, &bufferObj );
            exit(0);
    }
}

static void draw_func( void ) {
    // we pass zero as the last parameter, because out bufferObj is now
    // the source, and the field switches from being a pointer to a
    // bitmap to now mean an offset into a bitmap object
    glDrawPixels( DIM,DIM, GL_RGBA, GL_UNSIGNED_BYTE, 0 );
    glutSwapBuffers();
}

__device__ unsigned char value( float n1, float n2, int hue ) {
    if (hue > 360)      hue -= 360;
    else if (hue < 0)   hue += 360;

    if (hue < 60)
        return (unsigned char)(255 * (n1 + (n2-n1)*hue/60));
    if (hue < 180)
        return (unsigned char)(255 * n2);
    if (hue < 240)
        return (unsigned char)(255 * (n1 + (n2-n1)*(240-hue)/60));
    return (unsigned char)(255 * n1);
}

__global__ void kernel( uchar4 *optr, double *rho, double nnn,double mmm ) {
    // map from threadIdx/BlockIdx to pixel position
    int ibl=blockIdx.y+blockIdx.x*gridDim.y;
    int offset=threadIdx.x+blockDim.x*ibl;
//    int y=offset/gridDim.x;
//    int x=offset-y*gridDim.x;
    
    if(offset<DIM*DIM){
      float l = 1-(rho[offset]-nnn)/(mmm-nnn);
      float s = 1;
      int h = (180 + (int)(360.0f * rho[offset])) % 360;
      float m1, m2;

      if (l <= 0.5f)
            m2 = l * (1 + s);
      else
            m2 = l + s - l * s;
      m1 = 2 * l - m2;

      optr[offset].x = value( m1, m2, h+120 );
      optr[offset].y = value( m1, m2, h );
      optr[offset].z = value( m1, m2, h -120 );
      optr[offset].w = 255;
    }
}

__host__ void getrho(field gassed,modelparams parameters)
{
   double pi=acos(-1.0);

   int lx=parameters.lx;
   int ly=parameters.ly;
   int total_size=parameters.total_size;
   double dx=parameters.dx;
   double dy=parameters.dy;
   double h=parameters.hgas;
   int ihx=floor(h/dx)+1,ihy=floor(h/dy)+1;
   
   for(int crho=0;crho<lx*ly;crho++){
      gassed.rhogas.h[crho]=0.0;
   }
   double x,y;
   
   for(int igas=0;igas<Ngas;igas++){
      x=gassed.xgas.h[igas];
      y=gassed.ygas.h[igas];
      int ii=x/dx;
      int jj=y/dy;
//      printf("\n %d %f %f",igas, x,y);
      for (int xi=ii-ihx;xi<=ii+ihx;xi++){
            int ix=xi;
            ix=(ix<0?ix+lx:ix); // periodic boudnary conditions
            ix=(ix>=lx?ix-lx:ix);
            for(int yj=jj-ihy;yj<=jj+ihy;yj++){
                  int iy=yj;
                  iy=(iy<0?iy+ly:iy);
                  iy=(iy>=ly?iy-ly:iy);
                  double rr=sqrt(pow((x-(xi+0.5)*dx),2)+pow(y-((yj+0.5)*dy),2));
                  double wrh=0;
                  if(rr/h<=0.5){
                        wrh=8.0/(pi*pow(h,3))*(1-6.0*pow((rr/h),2)+6.0*pow((rr/h),3));
                  }
                  if(rr/h>0.5 && rr/h<=1.0){
                        wrh=(8.0/(pi*pow(h,3)))*2.0*pow((1-rr/h),3);
                  }
                  int crho=iy+ix*ly;
                  //gassed.rhogas.h[crho]=1.0;
                  gassed.rhogas.h[crho]=gassed.rhogas.h[crho]+wrh;
                  //if(x>
                  //printf("\n %d %d %d %d %d %f",igas, ii,jj,xi,yj,wrh);
   }}}
   double maxrho=0.0;
   double minrho=100.0;
   for(int crho=0;crho<lx*ly;crho++){
      if(gassed.rhogas.h[crho]>maxrho) maxrho=gassed.rhogas.h[crho];
      if(gassed.rhogas.h[crho]<minrho) minrho=gassed.rhogas.h[crho];
   }
   printf("\n MAXrho= %lf  MINrho= %lf \n",maxrho,minrho);

   cudaMemcpy(gassed.rhogas.d,gassed.rhogas.h, sizeof(double)*(total_size),cudaMemcpyHostToDevice) ;
//   printf("\n The gas dummy local density was initialized. The data was copied from host to device \n"); 
}   

__host__ void sphoscillatory(field *gassed,modelparams *parameters,int ticks)
{  
   double pi=acos(-1.0);

   double omega=parameters[0].omega;
   int lx=parameters[0].lx;
   int ly=parameters[0].ly;
//   int total_size=parameters[0].total_size;
   double dx=parameters[0].dx;
   double dy=parameters[0].dy;
   double h=parameters[0].hgas;
   int ihx=floor(h/dx)+1,ihy=floor(h/dy)+1;
   
   int igas=0;
   for(int ii=0; ii<16;ii++){
      for(int jj=0;jj<16;jj++){
            double x=ii*((lx*dx)/16.0)+0.3*h*cos((1+igas)*ticks*2.0*pi/omega);
            double y=jj*((ly*dy)/16.0)+0.3*h*sin((1+igas)*ticks*2.0*pi/omega);
            gassed[0].xgas.h[igas]=x;
            gassed[0].ygas.h[igas]=y;
            igas++;
   }}
//   printf("\n igas= %d Ngas= %d ihx= %d ihy= %d\n", igas,  Ngas,ihx,ihy);
   cudaMemcpy(gassed[0].xgas.d,gassed[0].xgas.h, sizeof(double)*(Ngas),cudaMemcpyHostToDevice) ;
   printf("\n The gas sph particles were initialized. The data was copied from host to device \n");  
}

__host__ void animate(uchar4* devPtr,field *gassed,modelparams *parameters,int t)
{
   double pi=acos(-1.0);

   int lx=parameters[0].lx;
   int ly=parameters[0].ly;
   sphoscillatory(gassed,parameters,t);
   getrho(gassed[0],parameters[0]);
   kernel<<<(lx*ly+512-1)/512,512>>>( devPtr, gassed[0].rhogas.d,0.0,2.475487);
}

int main( int argc, char **argv )
{
   cudaSetDevice(0);
   size_t free, total;
   printf("\n");
   cudaMemGetInfo(&free,&total);   
   printf("%lu KB free of total %lu KB at the beginning\n",(free/1024),(total/1024));
   const int lx=DIM,ly=lx,total_size=lx*ly;
   dim3 gridrho,threadsrho;
//   FILE * pFile;
//   char conf[13];

   threadsrho.x=tpbrho;
   threadsrho.y=1;
   threadsrho.z=1;
   gridrho.x=(int)ceil((double)sqrt(total_size)/sqrt((double)threadsrho.x)); 
   gridrho.y=(int)ceil((double)(total_size)/((double)threadsrho.x*(double)gridrho.x))+1;
   gridrho.z=1;  
   printf("\n no of blocks %d, no of threades per block %d,\n total no of threads %d, total size=lx*ly %d\n", gridrho.x*gridrho.y,threadsrho.x,gridrho.x*gridrho.y*threadsrho.x,total_size);
   field gassed[1];
   modelparams parameters[1];
   parameters[0].omega=53;
   parameters[0].lx=lx;
   parameters[0].ly=ly;
   parameters[0].total_size=total_size;
   pointerinit(gassed,parameters);
   
   cudaMemGetInfo(&free,&total);   
   printf("\n%lu KB free of total %lu KB after all memory allocations\n",(free/1024),(total/1024));
   cudaGLSetGLDevice(0);
   // these GLUT calls need to be made before the other GL calls
   glutInit( &argc, argv );
   glutInitDisplayMode( GLUT_DOUBLE | GLUT_RGBA );
   glutInitWindowSize( DIM, DIM );
   glutCreateWindow( "Test" );

glBindBuffer    = (PFNGLBINDBUFFERARBPROC)GET_PROC_ADDRESS("glBindBuffer");
   glDeleteBuffers = (PFNGLDELETEBUFFERSARBPROC)GET_PROC_ADDRESS("glDeleteBuffers");
   glGenBuffers    = (PFNGLGENBUFFERSARBPROC)GET_PROC_ADDRESS("glGenBuffers");
   glBufferData    = (PFNGLBUFFERDATAARBPROC)GET_PROC_ADDRESS("glBufferData");

    // the first three are standard OpenGL, the 4th is the CUDA reg 
    // of the bitmap these calls exist starting in OpenGL 1.5
   glGenBuffers( 1, &bufferObj );
   glBindBuffer( GL_PIXEL_UNPACK_BUFFER_ARB, bufferObj );
   glBufferData( GL_PIXEL_UNPACK_BUFFER_ARB, DIM * DIM * 4, NULL, GL_DYNAMIC_DRAW_ARB );

   cudaGraphicsGLRegisterBuffer( &resource, bufferObj,  cudaGraphicsMapFlagsNone ) ;

    // do work with the memory dst being on the GPU, gotten via mapping
    ( cudaGraphicsMapResources( 1, &resource, NULL ) );
   uchar4* devPtr;
   size_t  size;
   cudaGraphicsResourceGetMappedPointer( (void**)&devPtr,&size, resource);
   for(int t=0;t<1000; t++){
        animate(devPtr,gassed,parameters,t);
    // set up GLUT and kick off main loop
        cudaGraphicsUnmapResources( 1, &resource, NULL );

   }
   glutKeyboardFunc( key_func );
   glutDisplayFunc( draw_func );
   glutMainLoop();
}

Please help!

This won’t work as you already know.

You are creating all your animations, then you are calling the display process after all that is done.

You’ll need to modify your draw_func to do something like this:

-map CUDA/OGL resource
-run your animate function (once)
-unmap resource
-draw pixels
-swap buffers

you’ll also want a timer function to control the rate at which your animation proceeds.

Study the CUDA/OGL interop sample codes to get a a general idea of the structure and flow of such an animation. For example the simpleGL sample code.

Thanks I will start with the draw function and try to adapted to my problem.

Hello,

I manged to make it work. I have no idea why it works and how it works, but maybe it can be of help for others who just want to do simple animations for their simulations. Also I appreciate any suggestions to clean the code, to make it looks nicer. In particular the opengl related parts. I have never used include with my own files. I find pretty annoying that I had to define global pointers to use for the drawdraw function, because no arguments are permitted by glutDisplayFunc. The examples I in quest to avoid global variables for the draw function used by glutDisplayFunc suggest to use a struct, but the examples I saw were for C++ and I have no idea how to use in my code.

Here the two relevant fucntions:

static void drawdraw( void) {
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);
    // we pass zero as the last parameter, because out bufferObj is now
    // the source, and the field switches from being a pointer to a
    // bitmap to now mean an offset into a bitmap object
    animate(globalgassed,globalparameters,g_fAnim);

    glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
    glDrawPixels( DIM,DIM, GL_RGBA, GL_UNSIGNED_BYTE, 0 );
    glutSwapBuffers();
    g_fAnim +=0.01f;
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&eltime, start, stop);
    computeFPS();
    
}

void initglut(int argc, char **argv){
cudaGLSetGLDevice(0);
   // these GLUT calls need to be made before the other GL calls
   glutInit( &argc, argv );
   glutInitDisplayMode( GLUT_DOUBLE | GLUT_RGBA );
   glutInitWindowSize( DIM, DIM );
   glutCreateWindow( "Test" );
   glutDisplayFunc( drawdraw );
   glutKeyboardFunc( key_func );
   
   glutTimerFunc(REFRESH_DELAY, timerEvent,0);
   
   glBindBuffer    = (PFNGLBINDBUFFERARBPROC)GET_PROC_ADDRESS("glBindBuffer");
   glDeleteBuffers = (PFNGLDELETEBUFFERSARBPROC)GET_PROC_ADDRESS("glDeleteBuffers");
   glGenBuffers    = (PFNGLGENBUFFERSARBPROC)GET_PROC_ADDRESS("glGenBuffers");
   glBufferData    = (PFNGLBUFFERDATAARBPROC)GET_PROC_ADDRESS("glBufferData");

    // the first three are standard OpenGL, the 4th is the CUDA reg 
    // of the bitmap these calls exist starting in OpenGL 1.5
   glGenBuffers( 1, &bufferObj );
   glBindBuffer( GL_PIXEL_UNPACK_BUFFER_ARB, bufferObj );
   glBufferData( GL_PIXEL_UNPACK_BUFFER_ARB, DIM * DIM * 4, NULL, GL_DYNAMIC_DRAW_ARB );

   cudaGraphicsGLRegisterBuffer( &resource, bufferObj,  cudaGraphicsMapFlagsNone ) ;
}
//rm a.out; nvcc -lGL -lGLU  -lglut -O2 -lcufft -arch=sm_30 SPH.cu; optirun ./a.out
//rm a.out; nvcc -lGL -lGLU  -lglut  -O2 -lcufft -arch=sm_30 SPH.cu ; ./a.out 

#include <cstdio>
#include <cmath>
#include <iostream>
#include <cuda.h>
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#include "gl_helper.h"
#include "cuda_gl_interop.h"

# define bps 4
# define tpbrho 512
# define Ngas 256
# define Nsed 16
#define     DIM    512

#define MAX_EPSILON_ERROR 10.0f
#define THRESHOLD          0.30f
#define REFRESH_DELAY     10 //ms
#define MAX(a,b) ((a > b) ? a : b)

PFNGLBINDBUFFERARBPROC    glBindBuffer     = NULL;
PFNGLDELETEBUFFERSARBPROC glDeleteBuffers  = NULL;
PFNGLGENBUFFERSARBPROC    glGenBuffers     = NULL;
PFNGLBUFFERDATAARBPROC    glBufferData     = NULL;

GLuint  bufferObj;
cudaGraphicsResource *resource;
float g_fAnim = 0.0;

// mouse controls
int mouse_old_x, mouse_old_y;
int mouse_buttons = 0;
float rotate_x = 0.0, rotate_y = 0.0;
float translate_z = -3.0;

// Auto-Verification Code
int fpsCount = 0;        // FPS count for averaging
int fpsLimit = 1;        // FPS limit for sampling
int g_Index = 0;
float avgFPS = 0.0f;
unsigned int frameCount = 0;
unsigned int g_TotalErrors = 0;
float eltime;

typedef struct {
   double *h;
   double *d;
} doublevector;

typedef struct {
   doublevector xgas; //x-component of the position of gas particles
   doublevector ygas; //y-component of the position of gas particles
   doublevector zgas; //z-component of the position of gas particles
   doublevector rhogas; //z-component of the position of gas particles
   
/*
   doublevector vxgas; //x-component of the velocity of gas particles
   doublevector vygas; //y-component of the velocity of gas particles
   doublevector vzgas; //z-component of the velocity of gas particles
   doublevector hgas; // spread of gas particles
   doublevector xcol; //x-component of the position of sediment particles
   doublevector ycol; //y-component of the position of sediment particles
   doublevector zcol; //z-component of the position of sediment particles
   doublevector vxcol; //x-component of the velocity of sediment particles
   doublevector vycol; //y-component of the velocity of sediment particles
   doublevector vzcol; //z-component of the velocity of sediment particles
   doublevector rcol; // radius of the sediments. Spheres only
   doublevector xFpf; // x-component of particle-fluid interaction
   doublevector yFpf; // y-component of particle-fluid interaction
   doublevector zFpf; // z-component of particle-fluid interaction
*/
   
} field;

typedef struct {
   double omega;
   double dx;
   double dy;
   double hgas;
   double mgas;
   double rhoavegas;
   int lx;
   int ly;
   int total_size;
} modelparams;

field globalgassed[1];
modelparams globalparameters[1];
__host__ void fname(char *conf,const int sss)
{
    
    if(sss<10)
    {
    sprintf(conf,"psi0000%d.txt",sss);
    }
    else if(sss<100)
    {
    sprintf(conf,"psi000%d.txt",sss);
    }
    else if(sss<1000)
    {
    sprintf(conf,"psi00%d.txt",sss);
    } 
    else if(sss<10000)
    {
    sprintf(conf,"psi0%d.txt",sss);
    } 
    else if(sss<100000)
    {
    sprintf(conf,"psi%d.txt",sss);
    } 
}

__host__ void parameterinit(modelparams *parameters)
{   
    
   double pi=acos(-1.0);
   
   parameters[0].dx=0.1; 
   parameters[0].dy=0.1; 
   parameters[0].hgas=1.0; 
   parameters[0].rhoavegas=15.0; //average vañue of the gas density
   parameters[0].mgas=(parameters[0].rhoavegas*parameters[0].total_size*parameters[0].dx*parameters[0].dy)/(double)Ngas; // individual mass of a gas gas particle
    
}
__host__ void pointerinit(field *gassed,modelparams *parameters)
{  
     
   parameterinit(parameters);
   gassed[0].xgas.h=(new double[Ngas]);
   gassed[0].ygas.h=(new double[Ngas]);
   gassed[0].zgas.h=(new double[Ngas]);  
   cudaMalloc((void**)&gassed[0].xgas.d, sizeof(double)*(Ngas));
   cudaMalloc((void**)&gassed[0].ygas.d, sizeof(double)*(Ngas));
   cudaMalloc((void**)&gassed[0].zgas.d, sizeof(double)*(Ngas)); 
   
   gassed[0].rhogas.h=(new double[parameters[0].total_size]);  
   cudaMalloc((void**)&gassed[0].rhogas.d, sizeof(double)*(parameters[0].total_size)); 
   globalgassed[0]=gassed[0];
   globalparameters[0]=parameters[0];
}

static void key_func( unsigned char key, int x, int y ) {
    switch (key) {
        case 27:
            // clean up OpenGL and CUDA
            ( cudaGraphicsUnregisterResource( resource ) );
            glBindBuffer( GL_PIXEL_UNPACK_BUFFER_ARB, 0 );
            glDeleteBuffers( 1, &bufferObj );
            exit(0);
    }
}
/*
static void draw_func( void ) {
    // we pass zero as the last parameter, because out bufferObj is now
    // the source, and the field switches from being a pointer to a
    // bitmap to now mean an offset into a bitmap object
    glDrawPixels( DIM,DIM, GL_RGBA, GL_UNSIGNED_BYTE, 0 );
    glutSwapBuffers();
}

*/
__device__ unsigned char value( float n1, float n2, int hue ) {
    if (hue > 360)      hue -= 360;
    else if (hue < 0)   hue += 360;

    if (hue < 60)
        return (unsigned char)(255 * (n1 + (n2-n1)*hue/60));
    if (hue < 180)
        return (unsigned char)(255 * n2);
    if (hue < 240)
        return (unsigned char)(255 * (n1 + (n2-n1)*(240-hue)/60));
    return (unsigned char)(255 * n1);
}

__global__ void kernel( uchar4 *optr, double *rho, double nnn,double mmm ) {
    // map from threadIdx/BlockIdx to pixel position
    int ibl=blockIdx.y+blockIdx.x*gridDim.y;
    int offset=threadIdx.x+blockDim.x*ibl;
//    int y=offset/gridDim.x;
//    int x=offset-y*gridDim.x;
    
    if(offset<DIM*DIM){
      float l = 1-(rho[offset]-nnn)/(mmm-nnn);
      float s = 1;
      int h = (180 + (int)(360.0f * rho[offset])) % 360;
      float m1, m2;

      if (l <= 0.5f)
            m2 = l * (1 + s);
      else
            m2 = l + s - l * s;
      m1 = 2 * l - m2;

      optr[offset].x = value( m1, m2, h+120 );
      optr[offset].y = value( m1, m2, h );
      optr[offset].z = value( m1, m2, h -120 );
      optr[offset].w = 255;
    }
}

__host__ void getrho(field gassed,modelparams parameters)
{
   double pi=acos(-1.0);

   int lx=parameters.lx;
   int ly=parameters.ly;
   int total_size=parameters.total_size;
   double dx=parameters.dx;
   double dy=parameters.dy;
   double h=parameters.hgas;
   int ihx=floor(h/dx)+1,ihy=floor(h/dy)+1;
   
   for(int crho=0;crho<lx*ly;crho++){
      gassed.rhogas.h[crho]=0.0;
   }
   double x,y;
   
   for(int igas=0;igas<Ngas;igas++){
      x=gassed.xgas.h[igas];
      y=gassed.ygas.h[igas];
      int ii=x/dx;
      int jj=y/dy;
//      printf("\n %d %f %f",igas, x,y);
      for (int xi=ii-ihx;xi<=ii+ihx;xi++){
            int ix=xi;
            ix=(ix<0?ix+lx:ix); // periodic boudnary conditions
            ix=(ix>=lx?ix-lx:ix);
            for(int yj=jj-ihy;yj<=jj+ihy;yj++){
                  int iy=yj;
                  iy=(iy<0?iy+ly:iy);
                  iy=(iy>=ly?iy-ly:iy);
                  double rr=sqrt(pow((x-(xi+0.5)*dx),2)+pow(y-((yj+0.5)*dy),2));
                  double wrh=0;
                  if(rr/h<=0.5){
                        wrh=8.0/(pi*pow(h,3))*(1-6.0*pow((rr/h),2)+6.0*pow((rr/h),3));
                  }
                  if(rr/h>0.5 && rr/h<=1.0){
                        wrh=(8.0/(pi*pow(h,3)))*2.0*pow((1-rr/h),3);
                  }
                  int crho=iy+ix*ly;
                  //gassed.rhogas.h[crho]=1.0;
                  gassed.rhogas.h[crho]=gassed.rhogas.h[crho]+wrh;
                  //if(x>
                  //printf("\n %d %d %d %d %d %f",igas, ii,jj,xi,yj,wrh);
   }}}
   double maxrho=0.0;
   double minrho=100.0;
   for(int crho=0;crho<lx*ly;crho++){
      if(gassed.rhogas.h[crho]>maxrho) maxrho=gassed.rhogas.h[crho];
      if(gassed.rhogas.h[crho]<minrho) minrho=gassed.rhogas.h[crho];
   }
   printf("\n MAXrho= %lf  MINrho= %lf \n",maxrho,minrho);

   cudaMemcpy(gassed.rhogas.d,gassed.rhogas.h, sizeof(double)*(total_size),cudaMemcpyHostToDevice) ;
//   printf("\n The gas dummy local density was initialized. The data was copied from host to device \n"); 
}   

__host__ void sphinit(field gassed,modelparams parameters)
{  
   double pi=acos(-1.0);

   int lx=parameters.lx;
   int ly=parameters.ly;
//   int total_size=parameters.total_size;
   double dx=parameters.dx;
   double dy=parameters.dy;
   double h=parameters.hgas;
   int ihx=floor(h/dx)+1,ihy=floor(h/dy)+1;
   
   int igas=0;
   for(int ii=0; ii<16;ii++){
      for(int jj=0;jj<16;jj++){
            double x=ii*((lx*dx)/16.0);
            double y=jj*((ly*dy)/16.0);
            gassed.xgas.h[igas]=x;
            gassed.ygas.h[igas]=y;
            igas++;
   }}
   printf("\n igas= %d Ngas= %d ihx= %d ihy= %d\n", igas,  Ngas,ihx,ihy);
   cudaMemcpy(gassed.xgas.d,gassed.xgas.h, sizeof(double)*(Ngas),cudaMemcpyHostToDevice) ;
//   printf("\n The gas sph particles were initialized. The data was copied from host to device \n");  
}
__host__ void sphoscillatory(uchar4* devPtr,field *gassed,modelparams *parameters,float ticks)
{  
   double pi=acos(-1.0);

   double omega=parameters[0].omega;
   int lx=parameters[0].lx;
   int ly=parameters[0].ly;
//   int total_size=parameters[0].total_size;
   double dx=parameters[0].dx;
   double dy=parameters[0].dy;
   double h=parameters[0].hgas;
   int ihx=floor(h/dx)+1,ihy=floor(h/dy)+1;
   
   int igas=0;
   for(int ii=0; ii<16;ii++){
      for(int jj=0;jj<16;jj++){
            double x=ii*((lx*dx)/16.0)+0.3*h*cos((1+igas)*ticks*2.0*pi/omega);
            double y=jj*((ly*dy)/16.0)+0.3*h*sin((1+igas)*ticks*2.0*pi/omega);
            gassed[0].xgas.h[igas]=x;
            gassed[0].ygas.h[igas]=y;
            igas++;
   }}
//   printf("\n igas= %d Ngas= %d ihx= %d ihy= %d\n", igas,  Ngas,ihx,ihy);
   cudaMemcpy(gassed[0].xgas.d,gassed[0].xgas.h, sizeof(double)*(Ngas),cudaMemcpyHostToDevice) ;
   printf("\n The gas sph particles were initialized. The data was copied from host to device \n");
   getrho(gassed[0],parameters[0]);  
   kernel<<<(lx*ly+512-1)/512,512>>>( devPtr, gassed[0].rhogas.d,0.0,2.475487);
}

void timerEvent(int value)
{
    if (glutGetWindow())
    {
        glutPostRedisplay();
        glutTimerFunc(REFRESH_DELAY, timerEvent,0);
    }
}

void computeFPS()
{
    frameCount++;
    fpsCount++;

    if (fpsCount == fpsLimit)
    {
        avgFPS = 1.f / (eltime/ 1000.f);
        fpsCount = 0;
        fpsLimit = (int)MAX(avgFPS, 1.f);

    }

    char fps[256];
    sprintf(fps, "Cuda GL Interop (VBO): %3.1f fps (Max 100Hz)", avgFPS);
    glutSetWindowTitle(fps);
}

__host__ void animate(field *gassed,modelparams *parameters,float ttt){
   uchar4* devPtr;
   cudaGraphicsMapResources( 1, &resource, NULL );
   size_t  size;
   cudaGraphicsResourceGetMappedPointer( (void**)&devPtr,&size, resource);
   
   double pi=acos(-1.0);

//   int lx=parameters[0].lx;
//   int ly=parameters[0].ly;
   sphoscillatory(devPtr,gassed,parameters, ttt);
   cudaGraphicsUnmapResources( 1, &resource, NULL );
}

static void drawdraw( void) {
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);
    // we pass zero as the last parameter, because out bufferObj is now
    // the source, and the field switches from being a pointer to a
    // bitmap to now mean an offset into a bitmap object
    animate(globalgassed,globalparameters,g_fAnim);

    glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
    glDrawPixels( DIM,DIM, GL_RGBA, GL_UNSIGNED_BYTE, 0 );
    glutSwapBuffers();
    g_fAnim +=0.01f;
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&eltime, start, stop);
    computeFPS();
    
}

void initglut(int argc, char **argv){
cudaGLSetGLDevice(0);
   // these GLUT calls need to be made before the other GL calls
   glutInit( &argc, argv );
   glutInitDisplayMode( GLUT_DOUBLE | GLUT_RGBA );
   glutInitWindowSize( DIM, DIM );
   glutCreateWindow( "Test" );
   glutDisplayFunc( drawdraw );
   glutKeyboardFunc( key_func );
   
   glutTimerFunc(REFRESH_DELAY, timerEvent,0);
   
   glBindBuffer    = (PFNGLBINDBUFFERARBPROC)GET_PROC_ADDRESS("glBindBuffer");
   glDeleteBuffers = (PFNGLDELETEBUFFERSARBPROC)GET_PROC_ADDRESS("glDeleteBuffers");
   glGenBuffers    = (PFNGLGENBUFFERSARBPROC)GET_PROC_ADDRESS("glGenBuffers");
   glBufferData    = (PFNGLBUFFERDATAARBPROC)GET_PROC_ADDRESS("glBufferData");

    // the first three are standard OpenGL, the 4th is the CUDA reg 
    // of the bitmap these calls exist starting in OpenGL 1.5
   glGenBuffers( 1, &bufferObj );
   glBindBuffer( GL_PIXEL_UNPACK_BUFFER_ARB, bufferObj );
   glBufferData( GL_PIXEL_UNPACK_BUFFER_ARB, DIM * DIM * 4, NULL, GL_DYNAMIC_DRAW_ARB );

   cudaGraphicsGLRegisterBuffer( &resource, bufferObj,  cudaGraphicsMapFlagsNone ) ;
}   
int main( int argc, char **argv )
{
   cudaSetDevice(0);
   size_t free, total;
   printf("\n");
   cudaMemGetInfo(&free,&total);   
   printf("%lu KB free of total %lu KB at the beginning\n",(free/1024),(total/1024));
   const int lx=DIM,ly=lx,total_size=lx*ly;
   dim3 gridrho,threadsrho;
//   FILE * pFile;
//   char conf[13];

   threadsrho.x=tpbrho;
   threadsrho.y=1;
   threadsrho.z=1;
   gridrho.x=(int)ceil((double)sqrt(total_size)/sqrt((double)threadsrho.x)); 
   gridrho.y=(int)ceil((double)(total_size)/((double)threadsrho.x*(double)gridrho.x))+1;
   gridrho.z=1;  
   printf("\n no of blocks %d, no of threades per block %d,\n total no of threads %d, total size=lx*ly %d\n", gridrho.x*gridrho.y,threadsrho.x,gridrho.x*gridrho.y*threadsrho.x,total_size);
   field gassed[1];
   modelparams parameters[1];
   parameters[0].omega=53;
   parameters[0].lx=lx;
   parameters[0].ly=ly;
   parameters[0].total_size=total_size;
   pointerinit(gassed,parameters);
   
   cudaMemGetInfo(&free,&total);   
   printf("\n%lu KB free of total %lu KB after all memory allocations\n",(free/1024),(total/1024));
   
   initglut(argc,argv);

glutMainLoop();

}

Hello,

I manged to make it work. I have no idea why it works and how it works, but maybe it can be of help for others who just want to do simple animations for their simulations. Also I appreciate any suggestions to clean the code, to make it looks nicer. In particular the opengl related parts. I have never used include with my own files. I find pretty annoying that I had to define flobal pointers to use for the drawdraw function, because no arguments are permitted by flutDisplayFunc

Here the two relevant fucntions and the whole code:

static void drawdraw( void) {
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);
    // we pass zero as the last parameter, because out bufferObj is now
    // the source, and the field switches from being a pointer to a
    // bitmap to now mean an offset into a bitmap object
    animate(globalgassed,globalparameters,g_fAnim);

    glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
    glDrawPixels( DIM,DIM, GL_RGBA, GL_UNSIGNED_BYTE, 0 );
    glutSwapBuffers();
    g_fAnim +=0.01f;
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&eltime, start, stop);
    computeFPS();
    
}

void initglut(int argc, char **argv){
cudaGLSetGLDevice(0);
   // these GLUT calls need to be made before the other GL calls
   glutInit( &argc, argv );
   glutInitDisplayMode( GLUT_DOUBLE | GLUT_RGBA );
   glutInitWindowSize( DIM, DIM );
   glutCreateWindow( "Test" );
   glutDisplayFunc( drawdraw );
   glutKeyboardFunc( key_func );
   
   glutTimerFunc(REFRESH_DELAY, timerEvent,0);
   
   glBindBuffer    = (PFNGLBINDBUFFERARBPROC)GET_PROC_ADDRESS("glBindBuffer");
   glDeleteBuffers = (PFNGLDELETEBUFFERSARBPROC)GET_PROC_ADDRESS("glDeleteBuffers");
   glGenBuffers    = (PFNGLGENBUFFERSARBPROC)GET_PROC_ADDRESS("glGenBuffers");
   glBufferData    = (PFNGLBUFFERDATAARBPROC)GET_PROC_ADDRESS("glBufferData");

    // the first three are standard OpenGL, the 4th is the CUDA reg 
    // of the bitmap these calls exist starting in OpenGL 1.5
   glGenBuffers( 1, &bufferObj );
   glBindBuffer( GL_PIXEL_UNPACK_BUFFER_ARB, bufferObj );
   glBufferData( GL_PIXEL_UNPACK_BUFFER_ARB, DIM * DIM * 4, NULL, GL_DYNAMIC_DRAW_ARB );

   cudaGraphicsGLRegisterBuffer( &resource, bufferObj,  cudaGraphicsMapFlagsNone ) ;
}
//rm a.out; nvcc -lGL -lGLU  -lglut -O2 -lcufft -arch=sm_30 SPH.cu; optirun ./a.out
//rm a.out; nvcc -lGL -lGLU  -lglut  -O2 -lcufft -arch=sm_30 SPH.cu ; ./a.out 

#include <cstdio>
#include <cmath>
#include <iostream>
#include <cuda.h>
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#include "gl_helper.h"
#include "cuda_gl_interop.h"

# define bps 4
# define tpbrho 512
# define Ngas 256
# define Nsed 16
#define     DIM    512

#define MAX_EPSILON_ERROR 10.0f
#define THRESHOLD          0.30f
#define REFRESH_DELAY     10 //ms
#define MAX(a,b) ((a > b) ? a : b)

PFNGLBINDBUFFERARBPROC    glBindBuffer     = NULL;
PFNGLDELETEBUFFERSARBPROC glDeleteBuffers  = NULL;
PFNGLGENBUFFERSARBPROC    glGenBuffers     = NULL;
PFNGLBUFFERDATAARBPROC    glBufferData     = NULL;

GLuint  bufferObj;
cudaGraphicsResource *resource;
float g_fAnim = 0.0;

// mouse controls
int mouse_old_x, mouse_old_y;
int mouse_buttons = 0;
float rotate_x = 0.0, rotate_y = 0.0;
float translate_z = -3.0;

// Auto-Verification Code
int fpsCount = 0;        // FPS count for averaging
int fpsLimit = 1;        // FPS limit for sampling
int g_Index = 0;
float avgFPS = 0.0f;
unsigned int frameCount = 0;
unsigned int g_TotalErrors = 0;
float eltime;

typedef struct {
   double *h;
   double *d;
} doublevector;

typedef struct {
   doublevector xgas; //x-component of the position of gas particles
   doublevector ygas; //y-component of the position of gas particles
   doublevector zgas; //z-component of the position of gas particles
   doublevector rhogas; //z-component of the position of gas particles
   
/*
   doublevector vxgas; //x-component of the velocity of gas particles
   doublevector vygas; //y-component of the velocity of gas particles
   doublevector vzgas; //z-component of the velocity of gas particles
   doublevector hgas; // spread of gas particles
   doublevector xcol; //x-component of the position of sediment particles
   doublevector ycol; //y-component of the position of sediment particles
   doublevector zcol; //z-component of the position of sediment particles
   doublevector vxcol; //x-component of the velocity of sediment particles
   doublevector vycol; //y-component of the velocity of sediment particles
   doublevector vzcol; //z-component of the velocity of sediment particles
   doublevector rcol; // radius of the sediments. Spheres only
   doublevector xFpf; // x-component of particle-fluid interaction
   doublevector yFpf; // y-component of particle-fluid interaction
   doublevector zFpf; // z-component of particle-fluid interaction
*/
   
} field;

typedef struct {
   double omega;
   double dx;
   double dy;
   double hgas;
   double mgas;
   double rhoavegas;
   int lx;
   int ly;
   int total_size;
} modelparams;

field globalgassed[1];
modelparams globalparameters[1];
__host__ void fname(char *conf,const int sss)
{
    
    if(sss<10)
    {
    sprintf(conf,"psi0000%d.txt",sss);
    }
    else if(sss<100)
    {
    sprintf(conf,"psi000%d.txt",sss);
    }
    else if(sss<1000)
    {
    sprintf(conf,"psi00%d.txt",sss);
    } 
    else if(sss<10000)
    {
    sprintf(conf,"psi0%d.txt",sss);
    } 
    else if(sss<100000)
    {
    sprintf(conf,"psi%d.txt",sss);
    } 
}

__host__ void parameterinit(modelparams *parameters)
{   
    
   double pi=acos(-1.0);
   
   parameters[0].dx=0.1; 
   parameters[0].dy=0.1; 
   parameters[0].hgas=1.0; 
   parameters[0].rhoavegas=15.0; //average vañue of the gas density
   parameters[0].mgas=(parameters[0].rhoavegas*parameters[0].total_size*parameters[0].dx*parameters[0].dy)/(double)Ngas; // individual mass of a gas gas particle
    
}
__host__ void pointerinit(field *gassed,modelparams *parameters)
{  
     
   parameterinit(parameters);
   gassed[0].xgas.h=(new double[Ngas]);
   gassed[0].ygas.h=(new double[Ngas]);
   gassed[0].zgas.h=(new double[Ngas]);  
   cudaMalloc((void**)&gassed[0].xgas.d, sizeof(double)*(Ngas));
   cudaMalloc((void**)&gassed[0].ygas.d, sizeof(double)*(Ngas));
   cudaMalloc((void**)&gassed[0].zgas.d, sizeof(double)*(Ngas)); 
   
   gassed[0].rhogas.h=(new double[parameters[0].total_size]);  
   cudaMalloc((void**)&gassed[0].rhogas.d, sizeof(double)*(parameters[0].total_size)); 
   globalgassed[0]=gassed[0];
   globalparameters[0]=parameters[0];
}

static void key_func( unsigned char key, int x, int y ) {
    switch (key) {
        case 27:
            // clean up OpenGL and CUDA
            ( cudaGraphicsUnregisterResource( resource ) );
            glBindBuffer( GL_PIXEL_UNPACK_BUFFER_ARB, 0 );
            glDeleteBuffers( 1, &bufferObj );
            exit(0);
    }
}
/*
static void draw_func( void ) {
    // we pass zero as the last parameter, because out bufferObj is now
    // the source, and the field switches from being a pointer to a
    // bitmap to now mean an offset into a bitmap object
    glDrawPixels( DIM,DIM, GL_RGBA, GL_UNSIGNED_BYTE, 0 );
    glutSwapBuffers();
}

*/
__device__ unsigned char value( float n1, float n2, int hue ) {
    if (hue > 360)      hue -= 360;
    else if (hue < 0)   hue += 360;

    if (hue < 60)
        return (unsigned char)(255 * (n1 + (n2-n1)*hue/60));
    if (hue < 180)
        return (unsigned char)(255 * n2);
    if (hue < 240)
        return (unsigned char)(255 * (n1 + (n2-n1)*(240-hue)/60));
    return (unsigned char)(255 * n1);
}

__global__ void kernel( uchar4 *optr, double *rho, double nnn,double mmm ) {
    // map from threadIdx/BlockIdx to pixel position
    int ibl=blockIdx.y+blockIdx.x*gridDim.y;
    int offset=threadIdx.x+blockDim.x*ibl;
//    int y=offset/gridDim.x;
//    int x=offset-y*gridDim.x;
    
    if(offset<DIM*DIM){
      float l = 1-(rho[offset]-nnn)/(mmm-nnn);
      float s = 1;
      int h = (180 + (int)(360.0f * rho[offset])) % 360;
      float m1, m2;

      if (l <= 0.5f)
            m2 = l * (1 + s);
      else
            m2 = l + s - l * s;
      m1 = 2 * l - m2;

      optr[offset].x = value( m1, m2, h+120 );
      optr[offset].y = value( m1, m2, h );
      optr[offset].z = value( m1, m2, h -120 );
      optr[offset].w = 255;
    }
}

__host__ void getrho(field gassed,modelparams parameters)
{
   double pi=acos(-1.0);

   int lx=parameters.lx;
   int ly=parameters.ly;
   int total_size=parameters.total_size;
   double dx=parameters.dx;
   double dy=parameters.dy;
   double h=parameters.hgas;
   int ihx=floor(h/dx)+1,ihy=floor(h/dy)+1;
   
   for(int crho=0;crho<lx*ly;crho++){
      gassed.rhogas.h[crho]=0.0;
   }
   double x,y;
   
   for(int igas=0;igas<Ngas;igas++){
      x=gassed.xgas.h[igas];
      y=gassed.ygas.h[igas];
      int ii=x/dx;
      int jj=y/dy;
//      printf("\n %d %f %f",igas, x,y);
      for (int xi=ii-ihx;xi<=ii+ihx;xi++){
            int ix=xi;
            ix=(ix<0?ix+lx:ix); // periodic boudnary conditions
            ix=(ix>=lx?ix-lx:ix);
            for(int yj=jj-ihy;yj<=jj+ihy;yj++){
                  int iy=yj;
                  iy=(iy<0?iy+ly:iy);
                  iy=(iy>=ly?iy-ly:iy);
                  double rr=sqrt(pow((x-(xi+0.5)*dx),2)+pow(y-((yj+0.5)*dy),2));
                  double wrh=0;
                  if(rr/h<=0.5){
                        wrh=8.0/(pi*pow(h,3))*(1-6.0*pow((rr/h),2)+6.0*pow((rr/h),3));
                  }
                  if(rr/h>0.5 && rr/h<=1.0){
                        wrh=(8.0/(pi*pow(h,3)))*2.0*pow((1-rr/h),3);
                  }
                  int crho=iy+ix*ly;
                  //gassed.rhogas.h[crho]=1.0;
                  gassed.rhogas.h[crho]=gassed.rhogas.h[crho]+wrh;
                  //if(x>
                  //printf("\n %d %d %d %d %d %f",igas, ii,jj,xi,yj,wrh);
   }}}
   double maxrho=0.0;
   double minrho=100.0;
   for(int crho=0;crho<lx*ly;crho++){
      if(gassed.rhogas.h[crho]>maxrho) maxrho=gassed.rhogas.h[crho];
      if(gassed.rhogas.h[crho]<minrho) minrho=gassed.rhogas.h[crho];
   }
   printf("\n MAXrho= %lf  MINrho= %lf \n",maxrho,minrho);

   cudaMemcpy(gassed.rhogas.d,gassed.rhogas.h, sizeof(double)*(total_size),cudaMemcpyHostToDevice) ;
//   printf("\n The gas dummy local density was initialized. The data was copied from host to device \n"); 
}   

__host__ void sphinit(field gassed,modelparams parameters)
{  
   double pi=acos(-1.0);

   int lx=parameters.lx;
   int ly=parameters.ly;
//   int total_size=parameters.total_size;
   double dx=parameters.dx;
   double dy=parameters.dy;
   double h=parameters.hgas;
   int ihx=floor(h/dx)+1,ihy=floor(h/dy)+1;
   
   int igas=0;
   for(int ii=0; ii<16;ii++){
      for(int jj=0;jj<16;jj++){
            double x=ii*((lx*dx)/16.0);
            double y=jj*((ly*dy)/16.0);
            gassed.xgas.h[igas]=x;
            gassed.ygas.h[igas]=y;
            igas++;
   }}
   printf("\n igas= %d Ngas= %d ihx= %d ihy= %d\n", igas,  Ngas,ihx,ihy);
   cudaMemcpy(gassed.xgas.d,gassed.xgas.h, sizeof(double)*(Ngas),cudaMemcpyHostToDevice) ;
//   printf("\n The gas sph particles were initialized. The data was copied from host to device \n");  
}
__host__ void sphoscillatory(uchar4* devPtr,field *gassed,modelparams *parameters,float ticks)
{  
   double pi=acos(-1.0);

   double omega=parameters[0].omega;
   int lx=parameters[0].lx;
   int ly=parameters[0].ly;
//   int total_size=parameters[0].total_size;
   double dx=parameters[0].dx;
   double dy=parameters[0].dy;
   double h=parameters[0].hgas;
   int ihx=floor(h/dx)+1,ihy=floor(h/dy)+1;
   
   int igas=0;
   for(int ii=0; ii<16;ii++){
      for(int jj=0;jj<16;jj++){
            double x=ii*((lx*dx)/16.0)+0.3*h*cos((1+igas)*ticks*2.0*pi/omega);
            double y=jj*((ly*dy)/16.0)+0.3*h*sin((1+igas)*ticks*2.0*pi/omega);
            gassed[0].xgas.h[igas]=x;
            gassed[0].ygas.h[igas]=y;
            igas++;
   }}
//   printf("\n igas= %d Ngas= %d ihx= %d ihy= %d\n", igas,  Ngas,ihx,ihy);
   cudaMemcpy(gassed[0].xgas.d,gassed[0].xgas.h, sizeof(double)*(Ngas),cudaMemcpyHostToDevice) ;
   printf("\n The gas sph particles were initialized. The data was copied from host to device \n");
   getrho(gassed[0],parameters[0]);  
   kernel<<<(lx*ly+512-1)/512,512>>>( devPtr, gassed[0].rhogas.d,0.0,2.475487);
}

void timerEvent(int value)
{
    if (glutGetWindow())
    {
        glutPostRedisplay();
        glutTimerFunc(REFRESH_DELAY, timerEvent,0);
    }
}

void computeFPS()
{
    frameCount++;
    fpsCount++;

    if (fpsCount == fpsLimit)
    {
        avgFPS = 1.f / (eltime/ 1000.f);
        fpsCount = 0;
        fpsLimit = (int)MAX(avgFPS, 1.f);

    }

    char fps[256];
    sprintf(fps, "Cuda GL Interop (VBO): %3.1f fps (Max 100Hz)", avgFPS);
    glutSetWindowTitle(fps);
}

__host__ void animate(field *gassed,modelparams *parameters,float ttt){
   uchar4* devPtr;
   cudaGraphicsMapResources( 1, &resource, NULL );
   size_t  size;
   cudaGraphicsResourceGetMappedPointer( (void**)&devPtr,&size, resource);
   
   double pi=acos(-1.0);

//   int lx=parameters[0].lx;
//   int ly=parameters[0].ly;
   sphoscillatory(devPtr,gassed,parameters, ttt);
   cudaGraphicsUnmapResources( 1, &resource, NULL );
}

static void drawdraw( void) {
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);
    // we pass zero as the last parameter, because out bufferObj is now
    // the source, and the field switches from being a pointer to a
    // bitmap to now mean an offset into a bitmap object
    animate(globalgassed,globalparameters,g_fAnim);

    glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
    glDrawPixels( DIM,DIM, GL_RGBA, GL_UNSIGNED_BYTE, 0 );
    glutSwapBuffers();
    g_fAnim +=0.01f;
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&eltime, start, stop);
    computeFPS();
    
}

void initglut(int argc, char **argv){
cudaGLSetGLDevice(0);
   // these GLUT calls need to be made before the other GL calls
   glutInit( &argc, argv );
   glutInitDisplayMode( GLUT_DOUBLE | GLUT_RGBA );
   glutInitWindowSize( DIM, DIM );
   glutCreateWindow( "Test" );
   glutDisplayFunc( drawdraw );
   glutKeyboardFunc( key_func );
   
   glutTimerFunc(REFRESH_DELAY, timerEvent,0);
   
   glBindBuffer    = (PFNGLBINDBUFFERARBPROC)GET_PROC_ADDRESS("glBindBuffer");
   glDeleteBuffers = (PFNGLDELETEBUFFERSARBPROC)GET_PROC_ADDRESS("glDeleteBuffers");
   glGenBuffers    = (PFNGLGENBUFFERSARBPROC)GET_PROC_ADDRESS("glGenBuffers");
   glBufferData    = (PFNGLBUFFERDATAARBPROC)GET_PROC_ADDRESS("glBufferData");

    // the first three are standard OpenGL, the 4th is the CUDA reg 
    // of the bitmap these calls exist starting in OpenGL 1.5
   glGenBuffers( 1, &bufferObj );
   glBindBuffer( GL_PIXEL_UNPACK_BUFFER_ARB, bufferObj );
   glBufferData( GL_PIXEL_UNPACK_BUFFER_ARB, DIM * DIM * 4, NULL, GL_DYNAMIC_DRAW_ARB );

   cudaGraphicsGLRegisterBuffer( &resource, bufferObj,  cudaGraphicsMapFlagsNone ) ;
}   
int main( int argc, char **argv )
{
   cudaSetDevice(0);
   size_t free, total;
   printf("\n");
   cudaMemGetInfo(&free,&total);   
   printf("%lu KB free of total %lu KB at the beginning\n",(free/1024),(total/1024));
   const int lx=DIM,ly=lx,total_size=lx*ly;
   dim3 gridrho,threadsrho;
//   FILE * pFile;
//   char conf[13];

   threadsrho.x=tpbrho;
   threadsrho.y=1;
   threadsrho.z=1;
   gridrho.x=(int)ceil((double)sqrt(total_size)/sqrt((double)threadsrho.x)); 
   gridrho.y=(int)ceil((double)(total_size)/((double)threadsrho.x*(double)gridrho.x))+1;
   gridrho.z=1;  
   printf("\n no of blocks %d, no of threades per block %d,\n total no of threads %d, total size=lx*ly %d\n", gridrho.x*gridrho.y,threadsrho.x,gridrho.x*gridrho.y*threadsrho.x,total_size);
   field gassed[1];
   modelparams parameters[1];
   parameters[0].omega=53;
   parameters[0].lx=lx;
   parameters[0].ly=ly;
   parameters[0].total_size=total_size;
   pointerinit(gassed,parameters);
   
   cudaMemGetInfo(&free,&total);   
   printf("\n%lu KB free of total %lu KB after all memory allocations\n",(free/1024),(total/1024));
   
   initglut(argc,argv);

glutMainLoop();

}