Hi to everyone,
I am trying to use CUDA to program a Smoothed Particle Hydrodynamics (SPH) application.I have previously programmed this application using fortran and C++ (with openMP) and I would like to use CUDA to utilise my GTX295 graphics card capabilities. So I have read the CUDA programming guide, run succesfully some examples and I tried to proceed with the implementation of SPH using CUDA.
Since I am a new user I have changed just one subroutine of the C++ algorithm and modified it to run on the CUDA device just to see how it would behave. I am comparing the results of the subroutine programmed in CUDA, with the results of the same subroutine of the old code in C++ run on the CPU. The problem is that when I run the CUDA code with device emulation it runs as it should, giving the same results with the C++ code. But when I run the CUDA code on device it gives me totally erroneous values. So I would like to ask from more experienced CUDA users to spare some time, if possible and tell me any ideas about what might be going on.
To explain a bit further what I am doing :

I initialize variables on the host.

After that I assign pointers to all arrays that need to be passed to the device. Also I create pointers for these arrays on the device and I allocate some space for them using cudaMalloc(). I transfer these arrays to the device using cudaMemcpy. I have tested that these arrays are transfered correctly on device, by copying them back to a test array on host (using cudaMemcpy again) and printing their elements (performed by the device not emulation).
[codebox]
//  Pointer declarations for data transfer 
//  Host pointers (h > host ) 
double * p_X_h = X, * p_Y_h = Y, * p_Z_h = Z, // coordinates
* p_U_h = U, * p_V_h = V, * p_W_h = W, // velocity components
* p_rho_h = rho, * p_p_h = p, // density & pressure
* p_Sdudt_h=Sdudt, * p_Sdvdt_h=Sdvdt,
* p_Sdwdt_h=Sdwdt, * p_Sdrhodt_h=Sdrhodt; // derivatives
short int * p_type_h = type; // particle type
long int * p_numpart_h=number_of_particles_at_cell,
(* p_pindex_h)[particles_per_cell]=particle_index; // linked list
//  Device pointers (d > device ) 
double * p_X_d , * p_Y_d, * p_Z_d, // coordinates
* p_U_d, * p_V_d, * p_W_d, // velocity components
* p_rho_d, * p_p_d, // density & pressure
* p_Sdudt_d, * p_Sdvdt_d,
* p_Sdwdt_d, * p_Sdrhodt_d; // derivatives
short int * p_type_d; // particle type
long int * p_numpart_d,(* p_pindex_d)[particles_per_cell]; // linked list
//
// Calculating the size of arrays to be tranfered 
long int array_size_double = sizeof(double) * (ntot+1) ;
long int array_size_type = sizeof (short int ) * (ntot+1) ;
long int array_size_numpart= sizeof (long int ) * (ncell_x * ncell_y * ncell_z + 1);
long int array_size_pindex= sizeof (long int ) * (ncell_x * ncell_y * ncell_z + 1)*particles_per_cell;
//  Allocating memory space on device 
cudaMalloc ((void ** ) &p_X_d, array_size_double );
cudaMalloc ((void ** ) &p_Y_d, array_size_double );
cudaMalloc ((void ** ) &p_Z_d, array_size_double );
cudaMalloc ((void ** ) &p_U_d, array_size_double );
cudaMalloc ((void ** ) &p_V_d, array_size_double );
cudaMalloc ((void ** ) &p_W_d, array_size_double );
cudaMalloc ((void ** ) &p_rho_d, array_size_double );
cudaMalloc ((void ** ) &p_p_d, array_size_double );
cudaMalloc ((void ** ) &p_type_d, array_size_type );
cudaMalloc ((void ** ) &p_numpart_d, array_size_numpart);
cudaMalloc ((void ** ) &p_pindex_d, array_size_pindex );
cudaMalloc ((void ** ) &p_Sdrhodt_d, array_size_double );
cudaMalloc ((void ** ) &p_Sdudt_d, array_size_double );
cudaMalloc ((void ** ) &p_Sdvdt_d, array_size_double );
cudaMalloc ((void ** ) &p_Sdwdt_d, array_size_double );
//  Transfering array elements 
cudaMemcpy (p_X_d,p_X_h,array_size_double,cudaMemcpyHostToDevice );
cudaMemcpy (p_Y_d,p_Y_h,array_size_double,cudaMemcpyHostToDevice );
cudaMemcpy (p_Z_d,p_Z_h,array_size_double,cudaMemcpyHostToDevice );
cudaMemcpy (p_U_d,p_U_h,array_size_double,cudaMemcpyHostToDevice );
cudaMemcpy (p_V_d,p_V_h,array_size_double,cudaMemcpyHostToDevice );
cudaMemcpy (p_W_d,p_W_h,array_size_double,cudaMemcpyHostToDevice );
cudaMemcpy (p_rho_d,p_rho_h,array_size_double,cudaMemcpyHostToDevice );
cudaMemcpy (p_p_d,p_p_h,array_size_double,cudaMemcpyHostToDevice );
cudaMemcpy (p_type_d,p_type_h,array_size_type,cudaMemcpyHostToDevice );
cudaMemcpy (p_numpart_d,p_numpart_h,array_size_numpart,cudaMemcpyHostTo
Device );
cudaMemcpy (p_pindex_d,p_pindex_h,array_size_pindex,cudaMemcpyHostToDev
ice );
[/codebox]
ntot is the total number of particles and nwall are the wall particles. Finally the fluid particles are nfluid=ntotnwall. Their indexes are set so that the first fluid particle is nwall+1 and the last is ntot.
 Then I run the CUDA kernel. I have tried several different block sizes, but got the same results. The number of threads and blocks is arranged in order every (fluid) particle is calculated by an individual thread.
[codebox]
// Arranging workload on device cores  (A block contains maximum 512 threads blocksize <=512 ) 
int blocksize= 100 ;
int num_blocks= (ntotnwall)/blocksize + ((ntotnwall)%blocksize == 0 ? 0 : 1) ;
derivative_calculation <<<num_blocks,blocksize>>> (p_X_d,p_Y_d,p_Z_d,p_U_d,p_V_d,p_W_d,
p_rho_d,p_p_d,p_type_d,p_numpart_d,p_pindex_d,p_Sdrhodt_d,
p_Sdudt_d,p_Sdvdt_d,p_Sdwdt_d,gx,gy,gz,xmin,ymin,zmin,
cell_dimension,NXmin,NXmax,NYmin,NYmax,NZmin,NZmax,
ncell_x,ncell_y,ncell_z,smoothing_length,mass,
dynamic_viscosity,D_boundary,r_0,m_1,m_2,c_1,c_2,ntot,
nwall,tiny);
[/codebox]
Fluid particles are the only to have acceleration but they still interact with the wall particles. After that, results for the derivatives are returned to the host
[codebox]
// Copying back the derivatives as they were calculated by the device 
cudaMemcpy (p_Sdrhodt_h,p_Sdrhodt_d,array_size_double,
cudaMemcpyDeviceToHost );
cudaMemcpy (p_Sdudt_h,p_Sdudt_d,array_size_double,
cudaMemcpyDeviceToHost );
cudaMemcpy (p_Sdvdt_h,p_Sdvdt_d,array_size_double,
cudaMemcpyDeviceToHost );
cudaMemcpy (p_Sdwdt_h,p_Sdwdt_d,array_size_double,
cudaMemcpyDeviceToHost );
//
[/codebox]
 The CUDA kernel consists of several loops, calculating the interparticle interactions.
[codebox]
//  CUDA kernels 
global void derivative_calculation (double * p_X_d,double * p_Y_d,double * p_Z_d,double * p_U_d,double * p_V_d,
double *p_W_d, double * p_rho_d, double * p_p_d, short int * p_type_d, long int * p_numpart_d,
long int (* p_pindex_d)[1000], double * p_Sdrhodt_d, double * p_Sdudt_d,double * p_Sdvdt_d,double * p_Sdwdt_d,
double gx,double gy, double gz, double xmin,double ymin,double zmin,double cell_dimension,long int NXmin,
long int NXmax,long int NYmin,long int NYmax,long int NZmin,long int NZmax,double ncell_x, double ncell_y,
double ncell_z, double smoothing_length,double mass,double dynamic_viscosity,double D_boundary,
double r_0,double m_1,double m_2,double c_1,double c_2,long int ntot,long int nwall,double tiny)
{
long int i,NXgrid,NYgrid,NZgrid,NX_start,NX_end,NY_start,NY_end,
NZ_start,NZ_end,n1,n2,n3,n_position,nparticles,n4,j;
double difx,dify,difz,rab,q,Wi,dwdr,dwdx,dwdy,dwdz,Pabx,Paby,Pabz,
BFz;
i= blockIdx.x * blockDim.x + threadIdx.x+nwall+1; // nwall+1 is added so that only fluid particles are calulated
if (i <= ntot)
{
p_Sdrhodt_d[i]=0.;
p_Sdudt_d[i]=gx;
p_Sdvdt_d[i]=gy;
p_Sdwdt_d[i]=gz;
NXgrid=int((p_X_d[i]xmin)/cell_dimension);
NYgrid=int((p_Y_d[i]ymin)/cell_dimension);
NZgrid=int((p_Z_d[i]zmin)/cell_dimension);
NX_start=(NXmin > NXgrid1 ? NXmin : NXgrid1);
NX_end=(NXmax<NXgrid+1 ? NXmax : NXgrid+1 );
NY_start=(NYmin > NYgrid1 ? NYmin : NYgrid1);
NY_end=(NYmax<NYgrid+1 ? NYmax : NYgrid+1 );
NZ_start=(NZmin > NZgrid1 ? NZmin : NZgrid1);
NZ_end=(NZmax<NZgrid+1 ? NZmax : NZgrid+1 );
for ( n1=NX_start ; n1 <= NX_end; n1++)
{
for ( n2=NY_start ; n2 <= NY_end; n2++)
{
for ( n3=NZ_start ; n3 <= NZ_end; n3++)
{
n_position=ncell_y*ncell_z*n1+ncell_z*n2+n3+1;
nparticles=p_numpart_d[n_position];
if (nparticles != 0 )
{
for ( n4 = 1 ; n4 <= nparticles ; n4 ++ )
{
j=p_pindex_d[n_position][n4];
difx=p_X_d[i]p_X_d[j];
dify=p_Y_d[i]p_Y_d[j];
difz=p_Z_d[i]p_Z_d[j];
rab = sqrt(pow(difx,2.)+pow(dify,2.)+pow(difz,2.))+tiny;
q=rab/smoothing_length;
if (q <= 2.5)
{
if (q <= 1.5)
{
if (q <= 0.5)
{
Wi=c_1*(pow((2.5q),4)5.0*pow((1.5q),4)+10.*pow((0.5q),4));
dwdr=c_2*(4.*pow((2.5q),3.)20.*pow((1.5q),3.)+40.0*pow((0.5q),3));
}
else
{
Wi=c_1*(pow((2.5q),4)5.0*pow((1.5q),4));
dwdr=c_2*(4.*pow((2.5q),3.)20.*pow((1.5q),3.));
}
}
else
{
Wi=c_1*(pow((2.5q),4));
dwdr=c_2*(4.*pow((2.5q),3.));
}
}
else
{
Wi=0.;
dwdr=0.;
}
dwdx=dwdr*difx/rab;
dwdy=dwdr*dify/rab;
dwdz=dwdr*difz/rab;
// Derivative of density
p_Sdrhodt_d[i]=p_Sdrhodt_d[i]+mass*((p_U_d[i]p_U_d[j])*dwdx+(p_V_d[i]p_V_d[j])*dwdy
+(p_W_d[i]p_W_d[j])*dwdz);
// Morris viscosity term
Pabx=2*dynamic_viscosity/(p_rho_d[i]*p_rho_d[j])*(p_U_d[i]p_U_d[j])/rab*dwdr;
Paby=2*dynamic_viscosity/(p_rho_d[i]*p_rho_d[j])*(p_V_d[i]p_V_d[j])/rab*dwdr;
Pabz=2*dynamic_viscosity/(p_rho_d[i]*p_rho_d[j])*(p_W_d[i]p_W_d[j])/rab*dwdr;
//Velocity components derivatives
p_Sdudt_d[i]=mass*((p_p_d[i]/pow(p_rho_d[i],2.)+p_p_d[j]/pow(p_rho_d[j],2.))*dwdxPabx)
+p_Sdudt_d[i];
p_Sdvdt_d[i]=mass*((p_p_d[i]/pow(p_rho_d[i],2.)+p_p_d[j]/pow(p_rho_d[j],2.))*dwdyPaby)
+p_Sdvdt_d[i];
p_Sdwdt_d[i]=mass*((p_p_d[i]/pow(p_rho_d[i],2.)+p_p_d[j]/pow(p_rho_d[j],2.))*dwdzPabz)
+p_Sdwdt_d[i];
if (p_type_d [j] == 1 ) // if type [j]=1 boundary particle
{
if (p_Z_d[i] <= p_Z_d[j]+r_0)
{
BFz=D_boundary*(pow((r_0/difz),m_1)pow((r_0/difz),m_2))/difz ;
p_Sdwdt_d[i]=p_Sdwdt_d[i]+BFz;
}
}
}
}
}
}
}
}
}
[/codebox]
I have noticed that the results that are returned from the device are as if the CUDA kernel function was never called. Actually I would expect some strange values for the derivatives for wall particles (since the only derivatives that are calulated by the CUDA function are the derivatives of fluid particles), but I can’t understand why I get such values for fluid particles when I run the kernel on the device,whereas on emulation gives correct results.
So has anyone any idea why this happens? Am I missing anything?
Thanks in advance.