Function only works correctly with cuda 8.0 in release mode.

Hello,

I have a peculiar problem. I wrote a kernel and it seems to work correctly when I compile it cu cuda 8.0 on 3 different devices (GTX 1080, 980 and 750), but not on Tesla40c and GTX 760m where I have cuda 6.5 and 7.5, respectively. Also the program does not run when I add -g -G to compiling.

Here is the function and the corresponding -Xptxas -v output:

#define tpb 32
#define newtpb 864



__global__ 
//__launch_bounds__(newtpb, 2 )
void alt_forces_shrange
(float3 l2, float3 l, int* mapd, int* celld, 
int* npcd, int* lstcd, float3 *r_d, float3 *a_d, float *w_d)
{ 

  // __shared__ float3 shposip[tpb]; // tpb=nmaxcd must be  defined at beginning
  // __shared__ int shin_celli[tpb];
   __shared__ float4 shacc[newtpb];
   
   
   int celi=blockIdx.x;   
   
   int tid=threadIdx.x;
   
   int ni_celli=tid/32; //this number is between 0 and 26, the warp id   
   
   int np_celli=npcd[celi]; //number of particles in celli
   
   int celj=mapd[celi*27+ni_celli]; //index of the local neighboring cell
   
   int np_cellj=npcd[celj]; //number of particles in cellj
   
   int myjp=-100;       // some of the threads will do nothing becasue tthere are less than 32 particles
   
   int s=tid%32; // each thread gets a number between 0 and 31
   
   //if(s==0 && celi==33)  {printf(" %d %d %d %d %d tid= %d\n", celi,celj,ni_celli,np_celli,np_cellj,tid);}
   
   
   float3 myPosjp;
   
   if(s<np_cellj)
   {
      myjp=lstcd[celj*nmaxc+s];
      myPosjp=r_d[myjp];
   } 
      __syncthreads(); 
   
   
   
   
   // we load the particles of the celi on the shared memory in each cell at all time
   
   /*
   if(npcd[celi]>32 || npcd[celj]>32) 
   {printf("\n Shhhiiiit  \n");}
   */
   
   /*
   
   if(tid<np_celli)
   {
      int ip=lstcd[celi*nmaxcd+tid];
      shin_celli[tid]=ip;
      shposip[tid]=r_d[ip];
   } 
   */
   
   __syncthreads(); //at this point all data should be on in the block variables
   
  
   for(int iii=0;iii<np_celli;iii++)
   {        
        int myip=lstcd[celi*nmaxc+iii]; //        int myip=shin_celli[iii];
        float3 myPosip=r_d[myip];         //float3 myPosip=shposip[iii];
        float4 acc={0.0,0.0,0.0,0.0};    
        shacc[tid]=acc;
        __syncthreads();
        
        if( myjp>=0 && myjp!=myip && s<np_cellj) //to avoid selfinteraction
        {
                acc= newbodyBodyInteraction(l2 ,l ,myPosip, myPosjp);
         
        }        
        shacc[tid]=acc;
        __syncthreads();

// reduction on the shared memory
        if(tid>=512)
        {
                shacc[tid-512].x+=shacc[tid].x;
                shacc[tid-512].y+=shacc[tid].y;
                shacc[tid-512].z+=shacc[tid].z;
                shacc[tid-512].w+=shacc[tid].w;                
        }
        __syncthreads();
        
        for(int l=256;l>=1;l=l/2)
        {
                if(tid<l)
                {
                        shacc[tid].x+=shacc[tid+l].x;
                        shacc[tid].y+=shacc[tid+l].y;
                        shacc[tid].z+=shacc[tid+l].z;
                        shacc[tid].w+=shacc[tid+l].w; 
                }
                __syncthreads();
        }
        __syncthreads();
        
        if(tid==0)
        {
        a_d[myip].x = shacc[0].x;
        a_d[myip].y = shacc[0].y;
        a_d[myip].z = shacc[0].z;
        w_d[myip] = shacc[0].w;
        }
        __syncthreads();
   }
}



alt_forces_shrange<<<ncel,newtpb>>>(l2,l,mapd,celld,npcd,lstcd,r_d,a_d,w_d);
$ nvcc -O2 -Xptxas -v -arch=sm_30 lj_all_y00.cu

ptxas info    : Function properties for _Z18alt_forces_shrange6float3S_PiS0_S0_S0_PS_S1_Pf
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 39 registers, 13824 bytes smem, 400 bytes cmem[0], 32 bytes cmem[2]

Please be more specific about “does not run”. What are the exact symptoms? Code compiled in debug mode may run up to 10 times more slowly than the corresponding release build. This in turn could trigger a watchdog timer event, causing the CUDA context to be destroyed and the app terminated abnormally.

Please detail the steps you have taken so far to diagnose and debug the issue, so other posters do not have to make assumptions about what has been been tried and what hasn’t been tried yet.

If you want help for questions like this, you’re more likely to get help if you make it as easy as possible for others to help you.

I would suggest the following:

  1. Provide a COMPLETE code. That is something that someone could copy, paste, compile, and run, and see the problem, without having to add anything or change anything. Try to reduce your code to the minimum necessary to demonstrate the problem. For example, if you can demonstrate the problem with 3 loop iterations, don’t run thousands of iterations.
  2. Give the exact compilation command you used to compile the code.
  3. Describe your runtime environment:
  • CUDA version
  • GPU driver version
  • OS details
  • the GPU model you are running on
  1. Give a complete description of any inputs required and outputs expected from the code, as well as a description of how to determine whether the code is behaving correctly or incorrectly.
  2. It’s generally good practice to make sure your code is doing proper CUDA error checking, and you have run your code with cuda-memcheck, before asking for help. This will be useful to others who are trying to help you.

You can, of course, post more or less anything you wish on this site. I frequently ignore posts that are missing the above, yet are asking for this sort of debugging help. My suggestions are simply offered to increase your chances of getting useful help.

Hello,

Thank you for answering. When I run the code with cuda other than 8.0 or add debug information I get the wrong results. I thought that usually is the opposite the code work when compile -g -G and not in release mode. I could have attached a file, however it has more than 1000 lines and lots of garbage because it is a developing code. I thought that the problem is related to the calling. like the resources are too much.

Here is the code. At line 1062 there are three functions which work. One can switch between them and see what is correct and is incorrect output. But it is correct it will start with the forth column of the output with numbers around -3 and then goes up up to 0.1, while when it wrong it will output very large numbers and then the code becomes unstable numerically. Particles are not moved properly. If you change the parameters it is not warrantied it will work because the number of threads per is connected to the physical parameters like the range of the interactions.

// version 01.08.16
// Pair potential
// nvcc -arch=sm_20 -ptxas-options=-v
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <time.h>
#include <iostream>
#include <fstream>
#include <cuda.h>
#include <cuda_runtime.h>
using namespace std;
#define threads 256
#define tpb 32
#define newtpb 864

const int nx=16;
const int ny=16;
const int nz=16;
const int n=4*nx*ny*nz;
const int blocks = (int)((n + threads - 1) / threads);
float rho=1.00;
float lc=pow(4.0/rho,1.0/3.0);
float lx=nx*lc;
float ly=ny*lc;
float lz=nz*lc;
float rv=1.0*2.50;
const float rc=2.50;
const float rc2=rc*rc;
const float csr=0.0010;
const float qp=0.010;
const float dt=0.010;
const float cv=dt/2.0;
const float cf=dt*dt/2.0;
const int mx=(int)(lx/rv);
const int my=(int)(ly/rv);
const int mz=(int)(lz/rv);
const int nmaxc=tpb;
const int ncel=mx*my*mz;
float cx=lx/(float)(mx);
float cy=ly/(float)(my);
float cz=lz/(float)(mz);
__constant__ float rcd;
__constant__ float rc2d;
__constant__ float csrd;
__constant__ float qpd;
__constant__ float dtd;
__constant__ float cvd;
__constant__ float cfd;
__constant__ int mxd;
__constant__ int myd;
__constant__ int mzd;
__constant__ int nmaxcd;
__constant__ int nceld;

//===========================================================================
// kernel function: particle positions in fcc
//===========================================================================

void c_fcc(float d,float3 l,float3 *r)
{
  float d2=d/2.0;
  float3 l2={l.x/2.0, l.y/2.0, l.z/2.0};
  int s=0;
  for (int k=0;k<nz;k++)
{ for (int i=0;i<nx;i++)
{ for (int j=0;j<ny;j++)
{ r[s].x=-l2.x+float(i)*d+d2;
  r[s].y=-l2.y+float(j)*d+d2;
  r[s].z=-l2.z+float(k)*d+d2;
  s=s+1;
  r[s].x=-l2.x+float(i)*d+d;
  r[s].y=-l2.y+float(j)*d+d;
  r[s].z=-l2.z+float(k)*d+d2;
  s=s+1;
  r[s].x=-l2.x+float(i)*d+d;
  r[s].y=-l2.y+float(j)*d+d2;
  r[s].z=-l2.z+float(k)*d+d;
  s=s+1; 
  r[s].x=-l2.x+float(i)*d+d2;
  r[s].y=-l2.y+float(j)*d+d;
  r[s].z=-l2.z+float(k)*d+d;
  s=s+1;   
} } } 

  for (int i=0;i<s;i++)
{ r[i].x=(r[i].x>=l2.x?r[i].x-=l.x:r[i].x=r[i].x);
  r[i].x=(r[i].x<-l2.x?r[i].x+=l.x:r[i].x=r[i].x);
  r[i].y=(r[i].y>=l2.y?r[i].y-=l.y:r[i].y=r[i].y);
  r[i].y=(r[i].y<-l2.y?r[i].y+=l.y:r[i].y=r[i].y);
  r[i].z=(r[i].z>=l2.z?r[i].z-=l.z:r[i].z=r[i].z);
  r[i].z=(r[i].z<-l2.z?r[i].z+=l.z:r[i].z=r[i].z);
}

}

//===========================================================================
// kernel function: initial velocities
//===========================================================================

void c_vel(int seed, float temp, float3 *v)
{ 
  float3 sumv;
  sumv.x=0.0;
  sumv.y=0.0;
  sumv.z=0.0;
  srand(seed);
  for (int i=0;i<n;i++)
{ v[i].x=(float)rand()/(float)RAND_MAX-0.5;
  sumv.x+=v[i].x;
  v[i].y=(float)rand()/(float)RAND_MAX-0.5;
  sumv.y+=v[i].y; 
  v[i].z=(float)rand()/(float)RAND_MAX-0.5;
  sumv.z+=v[i].z;
  }
  sumv.x=sumv.x/float(n);
  sumv.y=sumv.y/float(n);
  sumv.z=sumv.z/float(n);
  float3 sumv2;
  sumv2.x=0.0;
  sumv2.y=0.0;
  sumv2.z=0.0;

  for (int i=0;i<n;i++)
{ v[i].x-=sumv.x;
  v[i].y-=sumv.y;
  v[i].z-=sumv.z;
  sumv2.x+=v[i].x*v[i].x;
  sumv2.y+=v[i].y*v[i].y;
  sumv2.z+=v[i].z*v[i].z;
  }
  float fs=sqrt(3.0*float(n)*temp/(sumv2.x+sumv2.y+sumv2.z));
  for (int i=0;i<n;i++)
{ v[i].x=v[i].x*fs;
  v[i].y=v[i].y*fs;
  v[i].z=v[i].z*fs;
  }
}

//===========================================================================
// kernel function: total forces on CPU
//===========================================================================

void cpu_forces(float3 l2, float3 l, float3 *r, float3 *a)
{
  for (int i=0;i<n;i++)
{ float3 bi=r[i];
  float3 ai={0.0, 0.0, 0.0};
  for (int j=0;j<n;j++)
{ float3 bj=r[j];
  float3 rij;
  rij.x = bi.x - bj.x;
  rij.y = bi.y - bj.y;
  rij.z = bi.z - bj.z;
  rij.x=(rij.x>=l2.x?rij.x-=l.x:rij.x=rij.x);
  rij.x=(rij.x<-l2.x?rij.x+=l.x:rij.x=rij.x);
  rij.y=(rij.y>=l2.y?rij.y-=l.y:rij.y=rij.y);
  rij.y=(rij.y<-l2.y?rij.y+=l.y:rij.y=rij.y);
  rij.z=(rij.z>=l2.z?rij.z-=l.z:rij.z=rij.z);
  rij.z=(rij.z<-l2.z?rij.z+=l.z:rij.z=rij.z);  
  float rij2 = rij.x * rij.x + rij.y * rij.y + rij.z * rij.z;
  if (rij2<rc2&i!=j) 
{ float absrij=sqrt(rij2);
  float rmrc= absrij-rc;
  float rmrc2=rmrc*rmrc;
  float rmrc4=rmrc2*rmrc2;
  float irij2 = 1.0/rij2;
  float irij6 = irij2 * irij2 * irij2;
  float ur = 4.0 * irij6 * (irij6-1.0);
  float u1r = -48.0 * irij6 * (irij6 - 0.50) * irij2;;
  float sr=rmrc4/(csr+rmrc4);
  float s1r=4.0*csr*rmrc*rmrc2/((csr+rmrc4)*(csr+rmrc4)*absrij);
  float fij=-(ur*s1r+u1r*sr);
  ai.x+= rij.x * fij;
  ai.y+= rij.y * fij;
  ai.z+= rij.z * fij;
} }
a[i]=ai; 
} }

//===========================================================================
// device function: pair interactions
//===========================================================================

__device__ float4
bodyBodyInteractionx(float3 l2, float3 l, float3 bi, float3 bj, float4 ai)
{
  float3 rij;
  rij.x = bi.x - bj.x;
  rij.y = bi.y - bj.y;
  rij.z = bi.z - bj.z;
  rij.x=(rij.x>=l2.x?rij.x-=l.x:rij.x=rij.x);
  rij.x=(rij.x<-l2.x?rij.x+=l.x:rij.x=rij.x);
  rij.y=(rij.y>=l2.y?rij.y-=l.y:rij.y=rij.y);
  rij.y=(rij.y<-l2.y?rij.y+=l.y:rij.y=rij.y);
  rij.z=(rij.z>=l2.z?rij.z-=l.z:rij.z=rij.z);
  rij.z=(rij.z<-l2.z?rij.z+=l.z:rij.z=rij.z);  
  float rij2 = rij.x * rij.x + rij.y * rij.y + rij.z * rij.z;
  if (rij2<rc2d) 
{ float irij2=1.0/rij2;
  float irij6 = irij2 * irij2 * irij2;
  float fij = 48.0 * irij6 * (irij6 - 0.50) * irij2;;
  ai.x+= rij.x * fij;
  ai.y+= rij.y * fij;
  ai.z+= rij.z * fij;
  ai.w+= rij2 * fij;
 }
  return ai;
}
//===========================================================================
// device function: pair interactions
//===========================================================================

__device__ float4
bodyBodyInteraction(float3 l2, float3 l, float3 bi, float3 bj, float4 ai)
{
  float3 rij;
  rij.x = bi.x - bj.x;
  rij.y = bi.y - bj.y;
  rij.z = bi.z - bj.z;
  rij.x=(rij.x>=l2.x?rij.x-=l.x:rij.x=rij.x);
  rij.x=(rij.x<-l2.x?rij.x+=l.x:rij.x=rij.x);
  rij.y=(rij.y>=l2.y?rij.y-=l.y:rij.y=rij.y);
  rij.y=(rij.y<-l2.y?rij.y+=l.y:rij.y=rij.y);
  rij.z=(rij.z>=l2.z?rij.z-=l.z:rij.z=rij.z);
  rij.z=(rij.z<-l2.z?rij.z+=l.z:rij.z=rij.z);  
  float rij2 = rij.x * rij.x + rij.y * rij.y + rij.z * rij.z;
  if (rij2<rc2d) 
{ float irij2=1.0/rij2;
  float irij6 = irij2 * irij2 * irij2;
  float absrij = sqrt(rij2);
  float rmrc= absrij-rcd;
  float rmrc2=rmrc*rmrc;
  float rmrc4=rmrc2*rmrc2;
  float ur = 4.0 * irij6 * (irij6-1.0);
  float u1r = -48.0 * irij6 * (irij6 - 0.50) * irij2;;
  float sr = rmrc4/(csrd+rmrc4);
  float s1r = 4.0*csrd*rmrc*rmrc2/((csrd+rmrc4)*(csrd+rmrc4)*absrij);
  float fij= -(ur*s1r+u1r*sr);
  ai.x+= rij.x * fij;
  ai.y+= rij.y * fij;
  ai.z+= rij.z * fij;
  ai.w+= rij2 * fij;
 }
  return ai;
}



__device__ float4
newbodyBodyInteraction(float3 l2, float3 l, float3 bi, float3 bj)
{
  float3 rij;
  float4 ai;
  rij.x = bi.x - bj.x;
  rij.y = bi.y - bj.y;
  rij.z = bi.z - bj.z;
  rij.x=(rij.x>=l2.x?rij.x-=l.x:rij.x=rij.x);
  rij.x=(rij.x<-l2.x?rij.x+=l.x:rij.x=rij.x);
  rij.y=(rij.y>=l2.y?rij.y-=l.y:rij.y=rij.y);
  rij.y=(rij.y<-l2.y?rij.y+=l.y:rij.y=rij.y);
  rij.z=(rij.z>=l2.z?rij.z-=l.z:rij.z=rij.z);
  rij.z=(rij.z<-l2.z?rij.z+=l.z:rij.z=rij.z);  
  float rij2 = rij.x * rij.x + rij.y * rij.y + rij.z * rij.z;
  if (rij2<rc2d) 
{ float irij2=1.0/rij2;
  float irij6 = irij2 * irij2 * irij2;
  float absrij = sqrt(rij2);
  float rmrc= absrij-rcd;
  float rmrc2=rmrc*rmrc;
  float rmrc4=rmrc2*rmrc2;
  float ur = 4.0 * irij6 * (irij6-1.0);
  float u1r = -48.0 * irij6 * (irij6 - 0.50) * irij2;;
  float sr = rmrc4/(csrd+rmrc4);
  float s1r = 4.0*csrd*rmrc*rmrc2/((csrd+rmrc4)*(csrd+rmrc4)*absrij);
  float fij= -(ur*s1r+u1r*sr);
  ai.x= rij.x * fij;
  ai.y= rij.y * fij;
  ai.z= rij.z * fij;
  ai.w= rij2 * fij;
 }
  return ai;
}


//===========================================================================
// device function: forces in GPU
//===========================================================================

__global__ void
calculate_forces(float3 l2, float3 l, float3 *r_d, float3 *a_d, float *w_d)
{
  __shared__ float3 shPosition[threads];
  float3 myPosition;
  float4 acc={0.0, 0.0, 0.0, 0.0};    
  int gtid = blockIdx.x * blockDim.x + threadIdx.x;
  myPosition = r_d[gtid];
  for (int tile = 0; tile < blocks; tile++) 
  {
    int idx = tile * threads + threadIdx.x;
    shPosition[threadIdx.x] = r_d[idx];
    __syncthreads();
  for (int j = 0; j < threads; j++) 
  { if(gtid!=j + tile * threads)
    {    
    acc = bodyBodyInteraction(l2 ,l ,myPosition, shPosition[j], acc);
    }
  }
    __syncthreads();
  }
  // Save the result in global memory for the integration step.
//  float4 acc3 = {acc.x, acc.y, acc.z, acc.w};
  a_d[gtid].x = acc.x;
  a_d[gtid].y = acc.y;
  a_d[gtid].z = acc.z;
  w_d[gtid] = acc.w;  
}

//===========================================================================
// cuda: first step velocity Verlet's integration
//===========================================================================

__global__ void intstep1(float3 l, float3 l2, float *cr_d, 
 float3 *r_d, float3 *v_d, float3 *a_d)
{ 
  int i=blockIdx.x*blockDim.x+threadIdx.x;
  float3 ri, vi, ai;
  ri=r_d[i];
  ai=a_d[i];
  vi=v_d[i];
  float cri=cr_d[i];
  ri.x+=cri*(vi.x*dtd+ai.x*cfd);
  ri.y+=cri*(vi.y*dtd+ai.y*cfd);
  ri.z+=cri*(vi.z*dtd+ai.z*cfd);
  vi.x+=cri*ai.x*cvd;
  vi.y+=cri*ai.y*cvd;
  vi.z+=cri*ai.z*cvd;
  ri.x=(ri.x>=l2.x?ri.x-=l.x:ri.x=ri.x);
  ri.x=(ri.x<-l2.x?ri.x+=l.x:ri.x=ri.x);
  ri.y=(ri.y>=l2.y?ri.y-=l.y:ri.y=ri.y);
  ri.y=(ri.y<-l2.y?ri.y+=l.y:ri.y=ri.y);
  ri.z=(ri.z>=l2.z?ri.z-=l.z:ri.z=ri.z);
  ri.z=(ri.z<-l2.z?ri.z+=l.z:ri.z=ri.z);
  r_d[i]=ri;
  v_d[i]=vi;
}

//===========================================================================
// cuda: second step of velocity Verlet's integration
//===========================================================================

__global__ void intstep2(float *cr_d,float3 *v_d, float3 *a_d)
{ int i=blockIdx.x*blockDim.x+threadIdx.x;
  float3 vi=v_d[i];
  float3 ai=a_d[i];
  float cri=cr_d[i];
  vi.x+=cri*ai.x*cvd;
  vi.y+=cri*ai.y*cvd;
  vi.z+=cri*ai.z*cvd;
  v_d[i]=vi;
}

//===========================================================================
// cuda: first step velocity Verlet's integration
//===========================================================================

__global__ void intstep1z(float sfr, float3 l, float3 l2, float *cr_d, 
 float3 *r_d, float3 *v_d, float3 *a_d)
{ 
  int i=blockIdx.x*blockDim.x+threadIdx.x;
  float3 ri, vi, ai;
  ri=r_d[i];
  ai=a_d[i];
  vi=v_d[i];
  float cri=cr_d[i];
  ri.z=((sfr-1.0)*cri+1.0)*ri.z;
  ri.x+=cri*(vi.x*dtd+ai.x*cfd);
  ri.y+=cri*(vi.y*dtd+ai.y*cfd);
  ri.z+=cri*(vi.z*dtd+ai.z*cfd);
  vi.x+=cri*ai.x*cvd;
  vi.y+=cri*ai.y*cvd;
  vi.z+=cri*ai.z*cvd;
  ri.x=(ri.x>=l2.x?ri.x-=l.x:ri.x=ri.x);
  ri.x=(ri.x<-l2.x?ri.x+=l.x:ri.x=ri.x);
  ri.y=(ri.y>=l2.y?ri.y-=l.y:ri.y=ri.y);
  ri.y=(ri.y<-l2.y?ri.y+=l.y:ri.y=ri.y);
  ri.z=(ri.z>=l2.z?ri.z-=l.z:ri.z=ri.z);
  ri.z=(ri.z<-l2.z?ri.z+=l.z:ri.z=ri.z);
  r_d[i]=ri;
  v_d[i]=vi;
}

//===========================================================================
// kernel function: write configuration
//===========================================================================

void c_confout
(int ist,int i,float3 l,float3 *r)
{
  char filename[256];
  ofstream file;
  sprintf(filename,"conf%d.%d",ist,i);
  file.open(filename);
{ file << l.x << '\t' << l.y << '\n';}
  for (int k=0;k<n;k++)
  {
  {file << r[k].x << '\t' << r[k].y << '\t' << r[k].z << '\n';}
  }
  file.close();
}

//===========================================================================
// kernel function: write velocities
//===========================================================================

void c_velout
(int ist, int i, float3 *v)
{
  char filename[256];
  ofstream file;
  sprintf(filename,"vel%d.%d",ist,i);
  file.open(filename);
  for (int k=0;k<n;k++)
  {
  {file << v[k].x << '\t' << v[k].y << '\t' << v[k].z << '\n';}
  }
  file.close();
}

//===========================================================================
// kernel function: read configuration
//===========================================================================

void c_confin
(int ist, int i, float3 l, float3 *r)
{
  char filename[256];
  ifstream file;
  sprintf(filename,"conf%d.%d",ist,i);
  file.open(filename);
  file >> l.x >> l.y >> l.z ;
  for (int k=0;k<n;k++)
  { 
  file >> r[k].x >> r[k].y >> r[k].z ;
  } 
  file.close();
}

//===========================================================================
// kernel function: read velocities
//===========================================================================

void c_velin
(int ist, int i, float3 *v)
{
  char filename[256];
  ifstream file;
  sprintf(filename,"vel%d.%d",ist,i);
  file.open(filename);
  for (int k=0;k<n;k++)
  {
  file >> v[k].x >> v[k].y >> v[k].z;
  }
  file.close();
}

//===========================================================================
// cuda: (twice) the particles kinetic energy
//===========================================================================

__global__ void ekini(float3 *v_d,float *k_d)
{ int i=blockIdx.x*blockDim.x+threadIdx.x;
  k_d[i]=v_d[i].x*v_d[i].x+v_d[i].y*v_d[i].y+v_d[i].z*v_d[i].z;
}

//===========================================================================
// device function: pair interactions
//===========================================================================

__device__ float
bodyBodyPotential(float3 l2, float3 l, float3 bi, float3 bj, float ui)
{
  float3 rij;
  ui=0.0;
  rij.x = bi.x - bj.x;
  rij.y = bi.y - bj.y;
  rij.z = bi.z - bj.z;
  rij.x=(rij.x>=l2.x?rij.x-=l.x:rij.x=rij.x);
  rij.x=(rij.x<-l2.x?rij.x+=l.x:rij.x=rij.x);
  rij.y=(rij.y>=l2.y?rij.y-=l.y:rij.y=rij.y);
  rij.y=(rij.y<-l2.y?rij.y+=l.y:rij.y=rij.y);
  rij.z=(rij.z>=l2.z?rij.z-=l.z:rij.z=rij.z);
  rij.z=(rij.z<-l2.z?rij.z+=l.z:rij.z=rij.z);  
  float rij2 = rij.x * rij.x + rij.y * rij.y + rij.z * rij.z;
  if (rij2<rc2d) 
{ float irij2 = 1.0/rij2;
  float irij6 = irij2 * irij2 * irij2;
  float rmrc = sqrt(rij2)-rcd;
  float rmrc2 = rmrc*rmrc;
  float rmrc4 = rmrc2*rmrc2;
  float sr = rmrc4/(csrd+rmrc4);
  float uij = 4.0 * sr * irij6 * (irij6-1.0);
  ui+= uij;
 }
  return ui;
}

//===========================================================================
// device function: forces in GPU
//===========================================================================

__global__ void
epoti(float3 l2, float3 l, float3 *r_d, float *u_d)
{
  __shared__ float3 shPosition[threads];
  float3 myPosition;
  float ui = 0.0;  
  int gtid = blockIdx.x * blockDim.x + threadIdx.x;
  myPosition = r_d[gtid];
  for (int tile = 0; tile < blocks; tile++) 
  {
    int idx = tile * threads + threadIdx.x;
    shPosition[threadIdx.x] = r_d[idx];
    __syncthreads();
  for (int j = 0; j < threads; j++) 
  { if(gtid!=j + tile * threads)
    {    
    ui = bodyBodyPotential(l2 ,l ,myPosition, shPosition[j], ui);
    }
  }
    __syncthreads();
  }
  // Save the result in global memory 
  u_d[gtid] = ui;
}

//===========================================================================
// device function: sum elements of vector a (reduction)
//===========================================================================

__global__ void sumvec(float *a, float *c, int blocks)
{
__shared__ float cache[threads];
int tid=threadIdx.x+blockIdx.x*blockDim.x;
int cacheIndex=threadIdx.x;
float temp=0.0;
while (tid<blocks)
{
temp +=a[tid];
tid+=blockDim.x*gridDim.x;
}
cache[cacheIndex]=temp;
__syncthreads();
int i=blockDim.x/2;
while (i!=0)
{
if (cacheIndex<i)
cache[cacheIndex]+=cache[cacheIndex+i];
__syncthreads();
i/=2;
}
if (cacheIndex==0)
c[blockIdx.x]=cache[0];
}

//===========================================================================
// device: scale coordinates by a factor sf
//===========================================================================

__global__ void scalcoord(float sf,float3 *v_d) 
{ 
  int i=blockIdx.x*blockDim.x+threadIdx.x;
  v_d[i].x=v_d[i].x*sf;
  v_d[i].y=v_d[i].y*sf;
  v_d[i].z=v_d[i].z*sf;
}

//===========================================================================
// device: define values 0 or 1
//===========================================================================

__global__ void setval(float a, float b, float3 *r_d, float *cr_d) 
{ 
  int i=blockIdx.x*blockDim.x+threadIdx.x;
  cr_d[i]=1.0;
  if ((r_d[i].z>a)&&(r_d[i].z<b))
  cr_d[i]=0.0;
}

//===========================================================================
// cuda function: forces (version b)
//===========================================================================

__global__ void calculate_forcesb
(float3 l2, float3 l, int* mapd, int* celld, 
int* npcd, int* lstcd, float3 *r_d, float3 *a_d, float *w_d)
{ int i=blockIdx.x*blockDim.x+threadIdx.x;
  float4 acc={0.0, 0.0, 0.0, 0.0}; 
  float3 myPosition = r_d[i];
  int offseti=celld[i]*27;
  for (int t=0;t<27;t++)
{ int celj=mapd[offseti+t];
  int offsetj=celj*nmaxcd;
  for (int k=0;k<npcd[celj];k++)
{ int j=lstcd[offsetj+k];
  if (j!=i)
  acc = bodyBodyInteraction(l2 ,l ,myPosition, r_d[j], acc); 
}
}
  a_d[i].x = acc.x;
  a_d[i].y = acc.y;
  a_d[i].z = acc.z;
  w_d[i] = acc.w;
}


//===========================================================================
// cuda function: forces
//===========================================================================

__global__ void forces_shrange
(float3 l2, float3 l, int* mapd, int* celld, 
int* npcd, int* lstcd, float3 *r_d, float3 *a_d, float *w_d)
{ 
  int celi=blockIdx.x;
  int offseti=celi*27;
  int tid=threadIdx.x;
  int np_celi=npcd[celi];
  float3 myPosition;
  int myip;
   __shared__ float3 shposjp[tpb]; // nmaxcd must be  defined at beginning
  
  float4 acc={0.0, 0.0, 0.0, 0.0};
  if(tid<np_celi)
  {
       myip=lstcd[celi*nmaxcd+tid];
       myPosition = r_d[myip];
  }
  
  for (int t=0;t<27;t++)
  { 
        int celj=mapd[offseti+t];
        int offsetj=celj*nmaxcd;
        if(tid<npcd[celj])
        {
                shposjp[tid]=r_d[lstcd[offsetj+tid]];
        }
        __syncthreads(); 
   
   /*if(npcd[celi]>32 || npcd[celj]>32) 
   {printf("\n Shhhiiiit  \n");}
   */     
        for(int j=0;j<npcd[celj];j++)
        {
                if((tid<np_celi) && (celi!=celj|| tid!=j)) //to avoid selfinteraction
                 {
                        acc = bodyBodyInteraction(l2 ,l ,myPosition, shposjp[j], acc);
                 }
        }
        __syncthreads();
   } 
   
   if(tid<np_celi)
  {
        a_d[myip].x = acc.x;
        a_d[myip].y = acc.y;
        a_d[myip].z = acc.z;
        w_d[myip] = acc.w;
  }
}



//===========================================================================
// cuda function: forces, alternate
//===========================================================================


__global__ 
//__launch_bounds__(newtpb, 2 )
void alt_forces_shrange
(float3 l2, float3 l, int* mapd, int* celld, 
int* npcd, int* lstcd, float3 *r_d, float3 *a_d, float *w_d)
{ 

  // __shared__ float3 shposip[tpb]; // tpb=nmaxcd must be  defined at beginning
  // __shared__ int shin_celli[tpb];
   __shared__ float4 shacc[newtpb];
   
   
   int celi=blockIdx.x;   
   
   int tid=threadIdx.x;
   
   int ni_celli=tid/32; //this number is between 0 and 26, the warp id   
   
   int np_celli=npcd[celi]; //number of particles in celli
   
   int celj=mapd[celi*27+ni_celli]; //index of the local neighboring cell
   
   int np_cellj=npcd[celj]; //number of particles in cellj
   
   int myjp=-100;       // some of the threads will do nothing becasue tthere are less than 32 particles
   
   int s=tid%32; // each thread gets a number between 0 and 31
   
   //if(s==0 && celi==33)  {printf(" %d %d %d %d %d tid= %d\n", celi,celj,ni_celli,np_celli,np_cellj,tid);}
   
   
   float3 myPosjp;
   
   if(s<np_cellj)
   {
      myjp=lstcd[celj*nmaxc+s];
      myPosjp=r_d[myjp];
   } 
      __syncthreads(); 
   
   
   
   
   // we load the particles of the celi on the shared memory in each cell at all time
   
   /*
   if(npcd[celi]>32 || npcd[celj]>32) 
   {printf("\n Shhhiiiit  \n");}
   */
   
   /*
   
   if(tid<np_celli)
   {
      int ip=lstcd[celi*nmaxcd+tid];
      shin_celli[tid]=ip;
      shposip[tid]=r_d[ip];
   } 
   */
   
   __syncthreads(); //at this point all data should be on in the block variables
   
  
   for(int iii=0;iii<np_celli;iii++)
   {        
        int myip=lstcd[celi*nmaxc+iii]; //        int myip=shin_celli[iii];
        float3 myPosip=r_d[myip];         //float3 myPosip=shposip[iii];
        float4 acc={0.0,0.0,0.0,0.0};    
        shacc[tid]=acc;
        __syncthreads();
        
        if( myjp>=0 && myjp!=myip && s<np_cellj) //to avoid selfinteraction
        {
                acc= newbodyBodyInteraction(l2 ,l ,myPosip, myPosjp);
         
        }        
        shacc[tid]=acc;
        __syncthreads();

// reduction on the shared memory
        if(tid>=512)
        {
                shacc[tid-512].x+=shacc[tid].x;
                shacc[tid-512].y+=shacc[tid].y;
                shacc[tid-512].z+=shacc[tid].z;
                shacc[tid-512].w+=shacc[tid].w;                
        }
        __syncthreads();
        
        for(int l=256;l>=1;l=l/2)
        {
                if(tid<l)
                {
                        shacc[tid].x+=shacc[tid+l].x;
                        shacc[tid].y+=shacc[tid+l].y;
                        shacc[tid].z+=shacc[tid+l].z;
                        shacc[tid].w+=shacc[tid+l].w; 
                }
                __syncthreads();
        }
        __syncthreads();
        
        if(tid==0)
        {
        a_d[myip].x = shacc[0].x;
        a_d[myip].y = shacc[0].y;
        a_d[myip].z = shacc[0].z;
        w_d[myip] = shacc[0].w;
        }
        __syncthreads();
   }
}




__global__ 
//__launch_bounds__(newtpb, 2 )
void newalt_forces_shrange
(float3 l2, float3 l, int* mapd, int* celld, 
int* npcd, int* lstcd, float3 *r_d, float3 *a_d, float *w_d)
{ 

  // __shared__ float3 shposip[tpb]; // tpb=nmaxcd must be  defined at beginning
  // __shared__ int shin_celli[tpb];
   __shared__ float4 shacc[newtpb];
   
   
   int celi=blockIdx.x;   
   
   int tid=threadIdx.x;
   
   int ni_celli=tid/32; //this number is between 0 and 26, the warp id   
   
   int np_celli=npcd[celi]; //number of particles in celli
   
   int celj=mapd[celi*27+ni_celli]; //index of the local neighboring cell
   
   int np_cellj=npcd[celj]; //number of particles in cellj
   
   int myjp=-100;       // some of the threads will do nothing becasue tthere are less than 32 particles
   
   int s=tid%32; // each thread gets a number between 0 and 31
   
   //if(s==0 && celi==33)  {printf(" %d %d %d %d %d tid= %d\n", celi,celj,ni_celli,np_celli,np_cellj,tid);}
   
   
   float3 myPosjp;
   
   if(s<np_cellj)
   {
      myjp=lstcd[celj*nmaxc+s];
      myPosjp=r_d[myjp];
   } 
      __syncthreads(); 
   
   
   
   
   // we load the particles of the celi on the shared memory in each cell at all time
   
   /*
   if(npcd[celi]>32 || npcd[celj]>32) 
   {printf("\n Shhhiiiit  \n");}

   */
   
   /*
   

   if(tid<np_celli)
   {
      int ip=lstcd[celi*nmaxcd+tid];

      shin_celli[tid]=ip;
      shposip[tid]=r_d[ip];

   } 
   */
   
   __syncthreads(); //at this point all data should be on in the block variables
   
  
   for(int iii=0;iii<np_celli;iii++)
   {        
        int myip=lstcd[celi*nmaxc+iii]; //        int myip=shin_celli[iii];
        float3 myPosip=r_d[myip];         //float3 myPosip=shposip[iii];
        float4 acc={0.0,0.0,0.0,0.0};    
        shacc[tid]=acc;
        __syncthreads();
        
        if( myjp>=0 && myjp!=myip && s<np_cellj) //to avoid selfinteraction
        {
                acc= newbodyBodyInteraction(l2 ,l ,myPosip, myPosjp);
         
        }        
        shacc[tid]=acc;
        __syncthreads();

// reduction on the shared memory
        if(tid>=512)
        {
                shacc[tid-512].x+=shacc[tid].x;
                shacc[tid-512].y+=shacc[tid].y;
                shacc[tid-512].z+=shacc[tid].z;
                shacc[tid-512].w+=shacc[tid].w;                
        }
        __syncthreads();
        
        for(int l=256;l>=1;l=l/2)
        {
                if(tid<l)
                {
                        shacc[tid].x+=shacc[tid+l].x;
                        shacc[tid].y+=shacc[tid+l].y;
                        shacc[tid].z+=shacc[tid+l].z;
                        shacc[tid].w+=shacc[tid+l].w; 
                }
                __syncthreads();
        }
        __syncthreads();
        
        if(tid==0)
        {
        a_d[myip].x = shacc[0].x;
        a_d[myip].y = shacc[0].y;
        a_d[myip].z = shacc[0].z;
        w_d[myip] = shacc[0].w;
        }
        __syncthreads();
   }
}

//===========================================================================
// cuda: cell index of particles
//===========================================================================


__global__ void celli(float3 c,float3 l2,float3 *r_d,int *celld)
{ 
  int i=blockIdx.x*blockDim.x+threadIdx.x;
  int mxi,myi,mzi; 
  mxi=(int)((r_d[i].x+l2.x)/c.x);
  myi=(int)((r_d[i].y+l2.y)/c.y);
  mzi=(int)((r_d[i].z+l2.z)/c.z);
  if (mxi>=mxd) mxi-=mxd;
  if (myi>=myd) myi-=myd;
  if (mzi>=mzd) mzi-=mzd;
  mxi=(mxi+mxd)%mxd;
  myi=(myi+myd)%myd;
  mzi=(mzi+mzd)%mzd;
  
  celld[i]=mxi+myi*mxd+mzi*mxd*myd; 
}

//===========================================================================
// cuda: list of number and indexes of particles belonging to a given cell
//===========================================================================

__global__ void hist(int *npcd, int *lstcd,int *celld)
{ int j,temp;
  int i=blockIdx.x*blockDim.x+threadIdx.x;
  j=celld[i];
  temp = atomicAdd(&npcd[j],1);
  lstcd[j*nmaxcd+temp]=i;
}

//===========================================================================
// kernel function: map cell and neighbor cell indexes
//===========================================================================

void c_initlcl(int *map)
{ int i,j,k,u,v,w,s;  
  for (i=0;i<mx;i++)
{ for (j=0;j<my;j++)
{ for (k=0;k<mz;k++)
{ int imap=((i+mx)%mx+((j+my)%my)*mx+((k+mz)%mz)*mx*my)*27;
  s=0;
  for (u=-1;u<=1;u++)
{ for (v=-1;v<=1;v++)
{ for (w=-1;w<=1;w++)
{ map[imap+s]=((i+u+mx)%mx+((j+v+my)%my)*mx+((k+w+mz)%mz)*mx*my);
  s=s+1;
} } } } } } }


//===========================================================================
// MAIN PROGRAM 
//===========================================================================

int main() {
  cudaSetDevice(0);
  int seed = 0;
  float3 l = {lx, ly, lz};
  float3 r[n], v[n];
  int map[27*ncel];
  float3 *r_d, *v_d, *a_d; 
  float *u_d, *k_d, *c1, *c2, *w_d, *cr_d;
  int *mapd, *celld, *npcd, *lstcd;  
  float ekin, epot, temp, temp0, w, sfv, pres, vol, sfr, pres0;
  temp0=0.50;
  pres0=0.10;
  float3 l2 = {l.x/2.0, l.y/2.0, l.z/2.0};
  float3 c = {l.x/(float)(mx), l.y/(float)(my), l.z/(float)(mz)};

  cudaMalloc((void**)&r_d, n*sizeof(float3));
  cudaMalloc((void**)&v_d, n*sizeof(float3));
  cudaMalloc((void**)&a_d, n*sizeof(float3));
  cudaMalloc((void**)&u_d, n*sizeof(float));
  cudaMalloc((void**)&k_d, n*sizeof(float));
  cudaMalloc((void**)&w_d, n*sizeof(float));
  cudaMalloc((void**)&c1, n*sizeof(float));
  cudaMalloc((void**)&c2, n*sizeof(float));
  cudaMalloc((void**)&cr_d, n*sizeof(float));
  cudaMalloc((void**)&mapd,27*ncel*sizeof(int));
  cudaMalloc((void**)&lstcd,ncel*nmaxc*sizeof(int));
  cudaMalloc((void**)&npcd,ncel*sizeof(int));
  cudaMalloc((void**)&celld,n*sizeof(int));
  cudaMemcpyToSymbol(rcd,&rc,sizeof(float));
  cudaMemcpyToSymbol(rc2d,&rc2,sizeof(float));
  cudaMemcpyToSymbol(csrd,&csr,sizeof(float));
  cudaMemcpyToSymbol(qpd,&qp,sizeof(float));
  cudaMemcpyToSymbol(dtd,&dt,sizeof(float));
  cudaMemcpyToSymbol(cvd,&cv,sizeof(float));
  cudaMemcpyToSymbol(cfd,&cf,sizeof(float));
  cudaMemcpyToSymbol(mxd,&mx,sizeof(int));
  cudaMemcpyToSymbol(myd,&my,sizeof(int));
  cudaMemcpyToSymbol(mzd,&mz,sizeof(int));
  cudaMemcpyToSymbol(nmaxcd,&nmaxc,sizeof(int));
  cudaMemcpyToSymbol(nceld,&ncel,sizeof(int));

  c_initlcl(map);
  c_fcc(lc,l,r);
  c_vel(seed,temp0,v);

  cudaMemcpy(r_d, r, n*sizeof(float3), cudaMemcpyHostToDevice);
  cudaMemcpy(v_d, v, n*sizeof(float3), cudaMemcpyHostToDevice); 
  cudaMemcpy(mapd,map,27*ncel*sizeof(int),cudaMemcpyHostToDevice);
 
  cudaMemsetAsync(npcd,0,ncel*sizeof(int));
  celli<<<blocks,threads>>>(c,l2,r_d,celld);
  hist<<<blocks,threads>>>(npcd,lstcd,celld);   
  calculate_forcesb<<<blocks,threads>>>(l2,l,mapd,celld,npcd,lstcd,r_d,a_d,w_d);

  sumvec<<<blocks,threads>>>(w_d,c1,n);
  sumvec<<<blocks,threads>>>(c1,c2,blocks);
  cudaMemcpy(&w,c2,sizeof(float),cudaMemcpyDeviceToHost);
  ekini<<<blocks,threads>>>(v_d, k_d);
  sumvec<<<blocks,threads>>>(k_d,c1,n);
  sumvec<<<blocks,threads>>>(c1,c2,blocks);
  cudaMemcpy(&ekin,c2,sizeof(float),cudaMemcpyDeviceToHost);
  temp=ekin/(3.0*float(n)); 
  vol=l.x*l.y*l.z;
  pres=(float(n)*temp+w/6.0)/vol;
  printf("%f\t%f\t%f\n",l.x,l.y,l.z); 
  printf("%f\t%f\n",ekin,temp); 
  printf("%f\t%f\n",vol,pres); 
  setval<<<blocks,threads>>>(l.z+1.0,-l.z-1.0,r_d,cr_d); 

//========================================================
// Main-Loop 1: 
//========================================================

  for (int i=0;i<30000;i++)
{
// propagation of coordinates and velocities
  sfr=1.0+qp*dt*(pres-pres0);
  l.x=sfr*l.x;
  l.y=sfr*l.y;
  l.z=sfr*l.z;
  l2.x = l.x/2.0;
  l2.y = l.y/2.0;
  l2.z = l.z/2.0;  
  c.x=l.x/(float)(mx);
  c.y=l.y/(float)(my);
  c.z=l.z/(float)(mz);
  scalcoord<<<blocks,threads>>>(sfr,r_d);
  intstep1<<<blocks,threads>>>(l,l2,cr_d,r_d,v_d,a_d); 
  cudaMemsetAsync(npcd,0,ncel*sizeof(int));
  celli<<<blocks,threads>>>(c,l2,r_d,celld);
  hist<<<blocks,threads>>>(npcd,lstcd,celld);   
  
  //calculate_forcesb<<<blocks,threads>>>(l2,l,mapd,celld,npcd,lstcd,r_d,a_d,w_d);
  
  //forces_shrange<<<ncel,tpb>>>(l2,l,mapd,celld,npcd,lstcd,r_d,a_d,w_d);
  
  alt_forces_shrange<<<ncel,newtpb>>>(l2,l,mapd,celld,npcd,lstcd,r_d,a_d,w_d);
  
  intstep2<<<blocks,threads>>>(cr_d,v_d,a_d);
  sumvec<<<blocks,threads>>>(w_d,c1,n);
  sumvec<<<blocks,threads>>>(c1,c2,blocks);
  cudaMemcpy(&w,c2,sizeof(float),cudaMemcpyDeviceToHost);
  ekini<<<blocks,threads>>>(v_d, k_d);
  sumvec<<<blocks,threads>>>(k_d,c1,n);
  sumvec<<<blocks,threads>>>(c1,c2,blocks);
  cudaMemcpy(&ekin,c2,sizeof(float),cudaMemcpyDeviceToHost);
  temp=ekin/(3.0*float(n)); 
  vol=l.x*l.y*l.z;
  pres=(float(n)*temp+w/6.0)/vol;
  printf("%i\t%f\t%f\t%f\t%f\n",i,n/vol,temp,pres,sfr); 
  if (i%300==0)
{ 
  scalcoord<<<blocks,threads>>>(sfv,v_d);  
}

  if (i%1000==0)
{ cudaMemcpy(r,r_d,n*sizeof(float3),cudaMemcpyDeviceToHost);
  cudaMemcpy(v,v_d,n*sizeof(float3),cudaMemcpyDeviceToHost);
  c_confout(seed,i,l,r);
  c_velout(seed,i,v);  
}

}
exit(0);

  cudaFree(r_d);
  cudaFree(v_d);
  cudaFree(a_d);
}

So make it easy for me. When I run your code, the output is nan immediately.
It’s now been running for thousands of iterations.

Does it need to run that long? Strip your code down to the minimum necessary to demonstrate the problem.
If you have isolated the error to a single kernel, build a code around just that kernel, supply the necessary input, and define the exact output you expect.

Hello0,
Thank you for looking into my problem. There was quite a lot of information in the first message.

The main program has a loop. In order to run only one iteration replace:

}
exit(0);

with

exit(0); 
}

This way it will run only one iteration.

There are three functions and one can switch easily between them:

//calculate_forcesb<<<blocks,threads>>>(l2,l,mapd,celld,npcd,lstcd,r_d,a_d,w_d);
  
  //forces_shrange<<<ncel,tpb>>>(l2,l,mapd,celld,npcd,lstcd,r_d,a_d,w_d);
  
  alt_forces_shrange<<<ncel,newtpb>>>(l2,l,mapd,celld,npcd,lstcd,r_d,a_d,w_d);
exit(0);

If run with the first two functions I get the same numbers no matter the system. If I run the last function the correct out put is only when i use cuda 8.0. I have different computers with different cards running Ubuntu 16.04 (1 GTX 1080, 1 GTX 980 and 1 GT 750). When I compile it with -g -G the program does not work any more. Now it outputs nans.

I have access to other computers running cuda 6.5 and cuda 7.5 and the offending function now it puts out wrong numbers like around -25 in the fourth column where it should be -3.06. These computers one has a 750m card and one a Tesla 40c.

Your reduction is wrong and is reading uninitialized shared memory.
You have 256 threads. Line 756 initializes only the first 256 entries of your shacc shared memory array.

Your reduction in lines 769-779 is incorrect. For example, On your first pass through, line 773 has thread tid=255 read (uninitalized) index 255+256=511.

You could change lines 759+ to be similar to:

for(int l=512;l>=1;l=l/2)
   {
     if(tid >= l)
       {
         shacc[tid-l].x+=shacc[tid].x;
...

I don’t know if this bug is the only cause of your problems, but it is one problem.

Thanks for yuor reply

So you think the following code which is a few lines above does not initialize everything to 0?

float4 acc={0.0,0.0,0.0,0.0};    
        shacc[tid]=acc;
        __syncthreads();

Also the number of threads the specific function is 864 (newtpb).

The problem with my reduction is that the share memory is not power of two so first i have to reduce all the values which are in teh array over 512.

Besides the program gives the right numbers when I compile in release mode.

It looks to me like there are 864 threads:

#define newtpb 864
...
alt_forces_shrange<<<ncel,newtpb>>>(l2,l,mapd,celld,npcd,lstcd,r_d,a_d,w_d);

There are two code posts from OP. My comments and line numbers refer to the most recent updated post which has “#define threads 256” on line 14, and invokes kernels with <<<blocks,threads>>>

Edit: ah, I see, the alt_forces call has a different launch bounds than the other kernel calls. Carry on. :-)

You referred to lines 756 to 779. There is only one code posting on this page that has those line numbers. Those line numbers in that posting are part of the alt_forces_shrange kernel whose definition starts on line 680. That kernel is invoked on line 1066 of that posting. The invocation on line 1066 of that posting specifies newtbp as the threads-per-block kernel configuration argument. newtbp is a define of 864

Hello,

Thanks for looking over, there are three functions with different calling configurations. one of them uses threads another one use tpb and the last kernel which has reduction uses newtpb (=864=9*32).

864 does not equal 9 times 32

Sorry,

newtpb=864=32*27;

I understand you think maybe the code is correct because you found one machine configuration where it works correctly. Unfortunately for me, that doesn’t give me much guidance as to how to find out why it is not working correctly elsewhere.

Debugging in this case, although tedious, doesn’t have to be conceptually difficult. Your code is producing nan outputs. Grab the tiger by the tail and see where it takes you. What I mean by that is aggressively track backwards e.g. using code instrumented with isnan():

http://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH__SINGLE.html#group__CUDA__MATH__SINGLE_1gf8093cd7c372f91c9837a82fd368c711

to find out where and under what circumstances the nan values are entering the computation stream. Since the only code that is changing is the aforementioned kernel, I put some isnan tests at the output of that kernel and sure enough it is writing nan from shared out to global memory.

I next moved my isnan test to the acc output of the newbodyBodyInteraction function, and sure enough nan values are emanating from that function as well. So I can continue this process, but you could do it too. This is the type of debugging legwork that I have come to expect with problems like this (especially when I didn’t write the code and I have no direct knowledge of what it is supposed to be doing and what the underlying arithmetic should be). Probably, you should accept the burden of debugging your own code in this fashion. I don’t think the datapoint that it runs differently on different machines is much use. In my case I am running on cuda 8 (8.0.61) in release mode on a Tesla K20X and it is failing. So even your CUDA 8 datapoint probably has some nuances to it.

Even if we wanted to postulate a bug in CUDA (say, defective code generation for every situation other than CUDA 8 sm_61 code, or whatever) the proof of that could only come through such a debugging process as above in the failing case.

I’ve already run cuda-memcheck plus the initcheck and racecheck subtools, with no errors reported, so I don’t think there are obvious coding errors here. I suspect the nan are occurring through some unexpected data combination.

Hello,

I am not sure what happens, but there are three machines in which the code works. With different cards from different generations but with the same version of nvidia driver and cuda.

I made some change to the code instead of running 864 threads now it runs 288, but now I have less parallelism because one thread does the work (loads and computing) of three threads. Now the code works without problem all my computers I have access to.

Thanks for the suggestions I will definitely try the isna() function. The best suggestion I saw in long time. I want to know what is happening. If the code fails on you configuration I suppose there is some problem I can not find and i will not be abe to sleep.

You have unguarded divisions in your bodyBodyInteraction functions. You need to decide how you want to handle the rij==0 case.

You could instead use the potential of a homogeneous spherical body, which switches from 1/r to r^2 (plus a constant to make the potential continuous) at it’s surface. Or use a “softening” like in the nbody example that comes with CUDA. But you need to handle the divergence somehow, or you risk obtaining nonsensical or NaN results.

Out of curiosity: What are the thoughts behind this gem?

rij.x=(rij.x>=l2.x?rij.x-=l.x:rij.x=rij.x);
rij.x=(rij.x<-l2.x?rij.x+=l.x:rij.x=rij.x);

Hello,

Thank you for looking at my problem. Yes the divisions with zero could be a problem, but the physical model prevents that. If the particles get so close together that 1/r is extremely large,it means that our numerical algorithm is unstable (if it is corrected implemented just the time step dt must be lowered). Also I have other versions which already have been tested, so in our simulation r will never get too small. The single precision might be a problem, but this we will be tested first just by doing a replace all float with double.

When we calculate the distances between particles we use periodic boundary conditions, which means we have copies of the same system at each boundary and corner. When we compute the force between two particles we have to take the distance to or particle or its image over the boundary whichever its the lowest. Particles close to the left boundary will interact with particles close the opposite side.

The original code was:

if(rij.x>=l2.x) rij.x-=l.x;
if(rij.x<-l2.x) rij.x+=l.x;

In order to make the code faster I suggested to the other people in the project to do this:

rij.x=(rij.x>=l2.x?rij.x-l.x:rij.x);
rij.x=(rij.x<-l2.x?rij.x+l.x:rij.x);

Last I did that the code was 2 times faster.

However there must have been some misunderstanding and it resulted in something else. Since it worked I did no bother to change it.

At this point just a short summary. I have a function which in release with cuda 8, but not with debug other versions. So far the results I see are correct and in agreement with alternative functions doing the same thing. I thought someone else get this kind of problem before. The kernel is quite heavy and uses lots of registers.

For the physical problem. We are doing molecular dynamics. For this we set a rectangular box and divided in cubes of equal size. Sorting the operation in cubes scales like N, so it is pretty fast. If I pick a particle in a cube i have to check if it interacts with the particles in its own cube and all the particles in the surrounding cubes (26+1 cubes in total). Based on the physical parameters we know that teach cube has 32 particle so I set the each block to have 32*27 threads and each thread is doing one ij pair or particles.

I’ve instrumented the code as below using the isnan/grabbing the tiger by the tail method, and I came to the same lines of code that tera pointed out (the gem: I really don’t understand it either) as being the “injection point” for nan into the computation stream.

Here is the first part of the output I get, on CUDA 8.0.61, CentOS 7, Tesla K20X:

$ ./t956 2>&1 |more
25.398417       25.398417       25.398417
24575.970703    0.499999
16384.000000    -3.186736
0       1.000987        inf     inf     0.999671
t956.cu:102: void my_assert(__nv_bool): block: [8,0,0], thread: [221,0,0] Assert
ion `cond` failed.
t956.cu:102: void my_assert(__nv_bool): block: [8,0,0], thread: [93,0,0] Asserti
on `cond` failed.
t956.cu:102: void my_assert(__nv_bool): block: [53,0,0], thread: [95,0,0] Assert
ion `cond` failed.
t956.cu:102: void my_assert(__nv_bool): block: [8,0,0], thread: [29,0,0] Asserti
on `cond` failed.
t956.cu:102: void my_assert(__nv_bool): block: [8,0,0], thread: [31,0,0] Asserti
on `cond` failed.
nan58: inf, inf, inf, nan, nan
nan58: 0.000000, 0.000000, inf, nan, nan
nan58: 0.000000, 0.000000, inf, nan, nan
nan58: inf, inf, inf, nan, nan
nan58: inf, inf, inf, nan, nan
nan58: 0.000000, inf, inf, nan, nan
nan58: 0.000000, 0.000000, inf, nan, nan
nan58: 0.000000, inf, inf, nan, nan
nan58: inf, inf, inf, nan, nan
nan58: inf, inf, inf, nan, nan
nan58: inf, inf, inf, nan, nan
nan58: inf, inf, inf, nan, nan
nan58: 0.000000, 0.000000, inf, nan, nan
nan58: 0.000000, 0.000000, inf, nan, nan
nan58: inf, inf, inf, nan, nan
nan58: inf, inf, inf, nan, nan
nan58: 0.000000, 0.000000, 0.000000, 0.000000, 0.000000
nan58: 0.000000, 0.000000, 0.000000, nan, nan
nan58: 0.000000, 0.000000, inf, nan, nan
nan58: 0.000000, 0.000000, inf, nan, nan
nan58: inf, inf, inf, nan, nan
nan58: inf, inf, inf, nan, nan
nan58: inf, inf, inf, nan, nan
nan58: inf, inf, inf, nan, nan
nan58: inf, inf, inf, nan, nan
nan58: inf, inf, inf, nan, nan
nan58: inf, inf, inf, nan, nan
nan58: inf, inf, inf, nan, nan
nan58: inf, inf, inf, nan, nan
nan58: inf, inf, inf, nan, nan
nan58: 0.000000, 0.000000, 0.000000, 0.000000, 0.000000
nan58: 0.000000, 0.000000, 0.000000, 0.000000, 0.000000

Here is the “instrumented” code:

$ cat t956.cu
// version 01.08.16
// Pair potential
// nvcc -arch=sm_20 -ptxas-options=-v
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <time.h>
#include <iostream>
#include <fstream>
#include <cuda.h>
#include <cuda_runtime.h>
#include <assert.h>
using namespace std;
#define threads 256
#define tpb 32
#define newtpb 864

const int nx=16;
const int ny=16;
const int nz=16;
const int n=4*nx*ny*nz;
const int blocks = (int)((n + threads - 1) / threads);
float rho=1.00;
float lc=pow(4.0/rho,1.0/3.0);
float lx=nx*lc;
float ly=ny*lc;
float lz=nz*lc;
float rv=1.0*2.50;
const float rc=2.50;
const float rc2=rc*rc;
const float csr=0.0010;
const float qp=0.010;
const float dt=0.010;
const float cv=dt/2.0;
const float cf=dt*dt/2.0;
const int mx=(int)(lx/rv);
const int my=(int)(ly/rv);
const int mz=(int)(lz/rv);
const int nmaxc=tpb;
const int ncel=mx*my*mz;
float cx=lx/(float)(mx);
float cy=ly/(float)(my);
float cz=lz/(float)(mz);
__constant__ float rcd;
__constant__ float rc2d;
__constant__ float csrd;
__constant__ float qpd;
__constant__ float dtd;
__constant__ float cvd;
__constant__ float cfd;
__constant__ int mxd;
__constant__ int myd;
__constant__ int mzd;
__constant__ int nmaxcd;
__constant__ int nceld;

//===========================================================================
// kernel function: particle positions in fcc
//===========================================================================

void c_fcc(float d,float3 l,float3 *r)
{
  float d2=d/2.0;
  float3 l2={l.x/2.0, l.y/2.0, l.z/2.0};
  int s=0;
  for (int k=0;k<nz;k++)
{ for (int i=0;i<nx;i++)
{ for (int j=0;j<ny;j++)
{ r[s].x=-l2.x+float(i)*d+d2;
  r[s].y=-l2.y+float(j)*d+d2;
  r[s].z=-l2.z+float(k)*d+d2;
  s=s+1;
  r[s].x=-l2.x+float(i)*d+d;
  r[s].y=-l2.y+float(j)*d+d;
  r[s].z=-l2.z+float(k)*d+d2;
  s=s+1;
  r[s].x=-l2.x+float(i)*d+d;
  r[s].y=-l2.y+float(j)*d+d2;
  r[s].z=-l2.z+float(k)*d+d;
  s=s+1;
  r[s].x=-l2.x+float(i)*d+d2;
  r[s].y=-l2.y+float(j)*d+d;
  r[s].z=-l2.z+float(k)*d+d;
  s=s+1;
} } }

  for (int i=0;i<s;i++)
{ r[i].x=(r[i].x>=l2.x?r[i].x-=l.x:r[i].x=r[i].x);
  r[i].x=(r[i].x<-l2.x?r[i].x+=l.x:r[i].x=r[i].x);
  r[i].y=(r[i].y>=l2.y?r[i].y-=l.y:r[i].y=r[i].y);
  r[i].y=(r[i].y<-l2.y?r[i].y+=l.y:r[i].y=r[i].y);
  r[i].z=(r[i].z>=l2.z?r[i].z-=l.z:r[i].z=r[i].z);
  r[i].z=(r[i].z<-l2.z?r[i].z+=l.z:r[i].z=r[i].z);
}

}

//===========================================================================
// kernel function: initial velocities
//===========================================================================
__device__ void my_assert(bool cond){ assert(cond);}
void c_vel(int seed, float temp, float3 *v)
{
  float3 sumv;
  sumv.x=0.0;
  sumv.y=0.0;
  sumv.z=0.0;
  srand(seed);
  for (int i=0;i<n;i++)
{ v[i].x=(float)rand()/(float)RAND_MAX-0.5;
  sumv.x+=v[i].x;
  v[i].y=(float)rand()/(float)RAND_MAX-0.5;
  sumv.y+=v[i].y;
  v[i].z=(float)rand()/(float)RAND_MAX-0.5;
  sumv.z+=v[i].z;
  }
  sumv.x=sumv.x/float(n);
  sumv.y=sumv.y/float(n);
  sumv.z=sumv.z/float(n);
  float3 sumv2;
  sumv2.x=0.0;
  sumv2.y=0.0;
  sumv2.z=0.0;

  for (int i=0;i<n;i++)
{ v[i].x-=sumv.x;
  v[i].y-=sumv.y;
  v[i].z-=sumv.z;
  sumv2.x+=v[i].x*v[i].x;
  sumv2.y+=v[i].y*v[i].y;
  sumv2.z+=v[i].z*v[i].z;
  }
  float fs=sqrt(3.0*float(n)*temp/(sumv2.x+sumv2.y+sumv2.z));
  for (int i=0;i<n;i++)
{ v[i].x=v[i].x*fs;
  v[i].y=v[i].y*fs;
  v[i].z=v[i].z*fs;
  }
}

//===========================================================================
// kernel function: total forces on CPU
//===========================================================================

void cpu_forces(float3 l2, float3 l, float3 *r, float3 *a)
{
  for (int i=0;i<n;i++)
{ float3 bi=r[i];
  float3 ai={0.0, 0.0, 0.0};
  for (int j=0;j<n;j++)
{ float3 bj=r[j];
  float3 rij;
  rij.x = bi.x - bj.x;
  rij.y = bi.y - bj.y;
  rij.z = bi.z - bj.z;
  rij.x=(rij.x>=l2.x?rij.x-=l.x:rij.x=rij.x);
  rij.x=(rij.x<-l2.x?rij.x+=l.x:rij.x=rij.x);
  rij.y=(rij.y>=l2.y?rij.y-=l.y:rij.y=rij.y);
  rij.y=(rij.y<-l2.y?rij.y+=l.y:rij.y=rij.y);
  rij.z=(rij.z>=l2.z?rij.z-=l.z:rij.z=rij.z);
  rij.z=(rij.z<-l2.z?rij.z+=l.z:rij.z=rij.z);
  float rij2 = rij.x * rij.x + rij.y * rij.y + rij.z * rij.z;
  if (rij2<rc2&i!=j)
{ float absrij=sqrt(rij2);
  float rmrc= absrij-rc;
  float rmrc2=rmrc*rmrc;
  float rmrc4=rmrc2*rmrc2;
  float irij2 = 1.0/rij2;
  float irij6 = irij2 * irij2 * irij2;
  float ur = 4.0 * irij6 * (irij6-1.0);
  float u1r = -48.0 * irij6 * (irij6 - 0.50) * irij2;;
  float sr=rmrc4/(csr+rmrc4);
  float s1r=4.0*csr*rmrc*rmrc2/((csr+rmrc4)*(csr+rmrc4)*absrij);
  float fij=-(ur*s1r+u1r*sr);
  ai.x+= rij.x * fij;
  ai.y+= rij.y * fij;
  ai.z+= rij.z * fij;
} }
a[i]=ai;
} }

//===========================================================================
// device function: pair interactions
//===========================================================================

__device__ float4
bodyBodyInteractionx(float3 l2, float3 l, float3 bi, float3 bj, float4 ai)
{
  float3 rij;
  rij.x = bi.x - bj.x;
  rij.y = bi.y - bj.y;
  rij.z = bi.z - bj.z;
  rij.x=(rij.x>=l2.x?rij.x-=l.x:rij.x=rij.x);
  rij.x=(rij.x<-l2.x?rij.x+=l.x:rij.x=rij.x);
  rij.y=(rij.y>=l2.y?rij.y-=l.y:rij.y=rij.y);
  rij.y=(rij.y<-l2.y?rij.y+=l.y:rij.y=rij.y);
  rij.z=(rij.z>=l2.z?rij.z-=l.z:rij.z=rij.z);
  rij.z=(rij.z<-l2.z?rij.z+=l.z:rij.z=rij.z);
  float rij2 = rij.x * rij.x + rij.y * rij.y + rij.z * rij.z;
  if (rij2<rc2d)
{ float irij2=1.0/rij2;
  float irij6 = irij2 * irij2 * irij2;
  float fij = 48.0 * irij6 * (irij6 - 0.50) * irij2;;
  ai.x+= rij.x * fij;
  ai.y+= rij.y * fij;
  ai.z+= rij.z * fij;
  ai.w+= rij2 * fij;
 }
  return ai;
}
//===========================================================================
// device function: pair interactions
//===========================================================================

__device__ float4
bodyBodyInteraction(float3 l2, float3 l, float3 bi, float3 bj, float4 ai)
{
  float3 rij;
  rij.x = bi.x - bj.x;
  rij.y = bi.y - bj.y;
  rij.z = bi.z - bj.z;
  rij.x=(rij.x>=l2.x?rij.x-=l.x:rij.x=rij.x);
  rij.x=(rij.x<-l2.x?rij.x+=l.x:rij.x=rij.x);
  rij.y=(rij.y>=l2.y?rij.y-=l.y:rij.y=rij.y);
  rij.y=(rij.y<-l2.y?rij.y+=l.y:rij.y=rij.y);
  rij.z=(rij.z>=l2.z?rij.z-=l.z:rij.z=rij.z);
  rij.z=(rij.z<-l2.z?rij.z+=l.z:rij.z=rij.z);
  float rij2 = rij.x * rij.x + rij.y * rij.y + rij.z * rij.z;
  if (rij2<rc2d)
{ float irij2=1.0/rij2;
  float irij6 = irij2 * irij2 * irij2;
  float absrij = sqrt(rij2);
  float rmrc= absrij-rcd;
  float rmrc2=rmrc*rmrc;
  float rmrc4=rmrc2*rmrc2;
  float ur = 4.0 * irij6 * (irij6-1.0);
  float u1r = -48.0 * irij6 * (irij6 - 0.50) * irij2;;
  float sr = rmrc4/(csrd+rmrc4);
  float s1r = 4.0*csrd*rmrc*rmrc2/((csrd+rmrc4)*(csrd+rmrc4)*absrij);
  float fij= -(ur*s1r+u1r*sr);
  ai.x+= rij.x * fij;
  ai.y+= rij.y * fij;
  ai.z+= rij.z * fij;
  ai.w+= rij2 * fij;
 }
  return ai;
}

__device__ float4
newbodyBodyInteraction(float3 l2, float3 l, float3 bi, float3 bj)
{
  float3 rij;
  float4 ai;
  rij.x = bi.x - bj.x;
  rij.y = bi.y - bj.y;
  rij.z = bi.z - bj.z;
  rij.x=(rij.x>=l2.x?rij.x-=l.x:rij.x=rij.x);
  rij.x=(rij.x<-l2.x?rij.x+=l.x:rij.x=rij.x);
  rij.y=(rij.y>=l2.y?rij.y-=l.y:rij.y=rij.y);
  rij.y=(rij.y<-l2.y?rij.y+=l.y:rij.y=rij.y);
  rij.z=(rij.z>=l2.z?rij.z-=l.z:rij.z=rij.z);
  rij.z=(rij.z<-l2.z?rij.z+=l.z:rij.z=rij.z);
  float rij2 = rij.x * rij.x + rij.y * rij.y + rij.z * rij.z;
  if (rij2<rc2d)
{ float irij2=1.0/rij2;
  float irij6 = irij2 * irij2 * irij2;
  float absrij = sqrt(rij2);
  float rmrc= absrij-rcd;
  float rmrc2=rmrc*rmrc;
  float rmrc4=rmrc2*rmrc2;
  float ur = 4.0 * irij6 * (irij6-1.0);
  float u1r = -48.0 * irij6 * (irij6 - 0.50) * irij2;;
  float sr = rmrc4/(csrd+rmrc4);
  float s1r = 4.0*csrd*rmrc*rmrc2/((csrd+rmrc4)*(csrd+rmrc4)*absrij);
  float fij= -(ur*s1r+u1r*sr);
  ai.x= rij.x * fij;
  ai.y= rij.y * fij;
  ai.z= rij.z * fij;
  ai.w= rij2 * fij;
 }
  return ai;
}

//===========================================================================
// device function: forces in GPU
//===========================================================================

__global__ void
calculate_forces(float3 l2, float3 l, float3 *r_d, float3 *a_d, float *w_d)
{
  __shared__ float3 shPosition[threads];
  float3 myPosition;
  float4 acc={0.0, 0.0, 0.0, 0.0};
  int gtid = blockIdx.x * blockDim.x + threadIdx.x;
  myPosition = r_d[gtid];
  for (int tile = 0; tile < blocks; tile++)
  {
    int idx = tile * threads + threadIdx.x;
    shPosition[threadIdx.x] = r_d[idx];
    __syncthreads();
  for (int j = 0; j < threads; j++)
  { if(gtid!=j + tile * threads)
    {
    acc = bodyBodyInteraction(l2 ,l ,myPosition, shPosition[j], acc);
    }
  }
    __syncthreads();
  }
  // Save the result in global memory for the integration step.
//  float4 acc3 = {acc.x, acc.y, acc.z, acc.w};
  a_d[gtid].x = acc.x;
  a_d[gtid].y = acc.y;
  a_d[gtid].z = acc.z;
  w_d[gtid] = acc.w;
}

//===========================================================================
// cuda: first step velocity Verlet's integration
//===========================================================================

__global__ void intstep1(float3 l, float3 l2, float *cr_d,
 float3 *r_d, float3 *v_d, float3 *a_d)
{
  int i=blockIdx.x*blockDim.x+threadIdx.x;
  float3 ri, vi, ai;
  ri=r_d[i];
          if (isnan(ri.x)) {printf("nan40\n"); my_assert(0);}
          if (isnan(ri.y)) {printf("nan41\n"); my_assert(0);}
          if (isnan(ri.z)) {printf("nan42\n"); my_assert(0);}
  ai=a_d[i];
          if (isnan(ai.x)) {printf("nan43\n"); my_assert(0);}
          if (isnan(ai.y)) {printf("nan44\n"); my_assert(0);}
          if (isnan(ai.z)) {printf("nan45\n"); my_assert(0);}
  vi=v_d[i];
          if (isnan(vi.x)) {printf("nan46\n"); my_assert(0);}
          if (isnan(vi.y)) {printf("nan47\n"); my_assert(0);}
          if (isnan(vi.z)) {printf("nan48\n"); my_assert(0);}
  float cri=cr_d[i];
          if (isnan(cri)) {printf("nan49\n"); my_assert(0);}
  ri.x+=cri*(vi.x*dtd+ai.x*cfd);
          if (isnan(ri.x)) {printf("nan52\n"); my_assert(0);}
  ri.y+=cri*(vi.y*dtd+ai.y*cfd);
          if (isnan(ri.y)) {printf("nan53\n"); my_assert(0);}
  ri.z+=cri*(vi.z*dtd+ai.z*cfd);
          if (isnan(ri.z)) {printf("nan54\n"); my_assert(0);}
  vi.x+=cri*ai.x*cvd;
          if (isnan(vi.x)) {printf("nan55\n"); my_assert(0);}
  vi.y+=cri*ai.y*cvd;
          if (isnan(vi.y)) {printf("nan56\n"); my_assert(0);}
  vi.z+=cri*ai.z*cvd;
          if (isnan(vi.y)) {printf("nan57\n"); my_assert(0);}
  float my_rix = ri.x;
  float my_l2x = l2.x;
  float my_lx = l.x;
  ri.x=(ri.x>=l2.x?ri.x-=l.x:ri.x=ri.x);
          if (isnan(ri.x)) {printf("nan58: %f, %f, %f, %f, %f\n", my_rix, my_l2x, my_lx, float(my_rix-=my_lx), float(my_rix=my_rix)); my_assert(0);}
  ri.x=(ri.x<-l2.x?ri.x+=l.x:ri.x=ri.x);
          if (isnan(ri.x)) {printf("nan59\n"); my_assert(0);}
  ri.y=(ri.y>=l2.y?ri.y-=l.y:ri.y=ri.y);
          if (isnan(ri.y)) {printf("nan60\n"); my_assert(0);}
  ri.y=(ri.y<-l2.y?ri.y+=l.y:ri.y=ri.y);
          if (isnan(ri.y)) {printf("nan61\n"); my_assert(0);}
  ri.z=(ri.z>=l2.z?ri.z-=l.z:ri.z=ri.z);
          if (isnan(ri.z)) {printf("nan62\n"); my_assert(0);}
  ri.z=(ri.z<-l2.z?ri.z+=l.z:ri.z=ri.z);
          if (isnan(ri.z)) {printf("nan63\n"); my_assert(0);}
  r_d[i]=ri;
  v_d[i]=vi;
}

//===========================================================================
// cuda: second step of velocity Verlet's integration
//===========================================================================

__global__ void intstep2(float *cr_d,float3 *v_d, float3 *a_d)
{ int i=blockIdx.x*blockDim.x+threadIdx.x;
  float3 vi=v_d[i];
  float3 ai=a_d[i];
  float cri=cr_d[i];
  vi.x+=cri*ai.x*cvd;
  vi.y+=cri*ai.y*cvd;
  vi.z+=cri*ai.z*cvd;
  v_d[i]=vi;
}

//===========================================================================
// cuda: first step velocity Verlet's integration
//===========================================================================

__global__ void intstep1z(float sfr, float3 l, float3 l2, float *cr_d,
 float3 *r_d, float3 *v_d, float3 *a_d)
{
  int i=blockIdx.x*blockDim.x+threadIdx.x;
  float3 ri, vi, ai;
  ri=r_d[i];
  ai=a_d[i];
  vi=v_d[i];
  float cri=cr_d[i];
  ri.z=((sfr-1.0)*cri+1.0)*ri.z;
  ri.x+=cri*(vi.x*dtd+ai.x*cfd);
  ri.y+=cri*(vi.y*dtd+ai.y*cfd);
  ri.z+=cri*(vi.z*dtd+ai.z*cfd);
  vi.x+=cri*ai.x*cvd;
  vi.y+=cri*ai.y*cvd;
  vi.z+=cri*ai.z*cvd;
  ri.x=(ri.x>=l2.x?ri.x-=l.x:ri.x=ri.x);
  ri.x=(ri.x<-l2.x?ri.x+=l.x:ri.x=ri.x);
  ri.y=(ri.y>=l2.y?ri.y-=l.y:ri.y=ri.y);
  ri.y=(ri.y<-l2.y?ri.y+=l.y:ri.y=ri.y);
  ri.z=(ri.z>=l2.z?ri.z-=l.z:ri.z=ri.z);
  ri.z=(ri.z<-l2.z?ri.z+=l.z:ri.z=ri.z);
  r_d[i]=ri;
  v_d[i]=vi;
}

//===========================================================================
// kernel function: write configuration
//===========================================================================

void c_confout
(int ist,int i,float3 l,float3 *r)
{
  char filename[256];
  ofstream file;
  sprintf(filename,"conf%d.%d",ist,i);
  file.open(filename);
{ file << l.x << '\t' << l.y << '\n';}
  for (int k=0;k<n;k++)
  {
  {file << r[k].x << '\t' << r[k].y << '\t' << r[k].z << '\n';}
  }
  file.close();
}

//===========================================================================
// kernel function: write velocities
//===========================================================================

void c_velout
(int ist, int i, float3 *v)
{
  char filename[256];
  ofstream file;
  sprintf(filename,"vel%d.%d",ist,i);
  file.open(filename);
  for (int k=0;k<n;k++)
  {
  {file << v[k].x << '\t' << v[k].y << '\t' << v[k].z << '\n';}
  }
  file.close();
}

//===========================================================================
// kernel function: read configuration
//===========================================================================

void c_confin
(int ist, int i, float3 l, float3 *r)
{
  char filename[256];
  ifstream file;
  sprintf(filename,"conf%d.%d",ist,i);
  file.open(filename);
  file >> l.x >> l.y >> l.z ;
  for (int k=0;k<n;k++)
  {
  file >> r[k].x >> r[k].y >> r[k].z ;
  }
  file.close();
}

//===========================================================================
// kernel function: read velocities
//===========================================================================

void c_velin
(int ist, int i, float3 *v)
{
  char filename[256];
  ifstream file;
  sprintf(filename,"vel%d.%d",ist,i);
  file.open(filename);
  for (int k=0;k<n;k++)
  {
  file >> v[k].x >> v[k].y >> v[k].z;
  }
  file.close();
}

//===========================================================================
// cuda: (twice) the particles kinetic energy
//===========================================================================

__global__ void ekini(float3 *v_d,float *k_d)
{ int i=blockIdx.x*blockDim.x+threadIdx.x;
  k_d[i]=v_d[i].x*v_d[i].x+v_d[i].y*v_d[i].y+v_d[i].z*v_d[i].z;
}

//===========================================================================
// device function: pair interactions
//===========================================================================

__device__ float
bodyBodyPotential(float3 l2, float3 l, float3 bi, float3 bj, float ui)
{
  float3 rij;
  ui=0.0;
  rij.x = bi.x - bj.x;
  rij.y = bi.y - bj.y;
  rij.z = bi.z - bj.z;
  rij.x=(rij.x>=l2.x?rij.x-=l.x:rij.x=rij.x);
  rij.x=(rij.x<-l2.x?rij.x+=l.x:rij.x=rij.x);
  rij.y=(rij.y>=l2.y?rij.y-=l.y:rij.y=rij.y);
  rij.y=(rij.y<-l2.y?rij.y+=l.y:rij.y=rij.y);
  rij.z=(rij.z>=l2.z?rij.z-=l.z:rij.z=rij.z);
  rij.z=(rij.z<-l2.z?rij.z+=l.z:rij.z=rij.z);
  float rij2 = rij.x * rij.x + rij.y * rij.y + rij.z * rij.z;
  if (rij2<rc2d)
{ float irij2 = 1.0/rij2;
  float irij6 = irij2 * irij2 * irij2;
  float rmrc = sqrt(rij2)-rcd;
  float rmrc2 = rmrc*rmrc;
  float rmrc4 = rmrc2*rmrc2;
  float sr = rmrc4/(csrd+rmrc4);
  float uij = 4.0 * sr * irij6 * (irij6-1.0);
  ui+= uij;
 }
  return ui;
}

//===========================================================================
// device function: forces in GPU
//===========================================================================

__global__ void
epoti(float3 l2, float3 l, float3 *r_d, float *u_d)
{
  __shared__ float3 shPosition[threads];
  float3 myPosition;
  float ui = 0.0;
  int gtid = blockIdx.x * blockDim.x + threadIdx.x;
  myPosition = r_d[gtid];
  for (int tile = 0; tile < blocks; tile++)
  {
    int idx = tile * threads + threadIdx.x;
    shPosition[threadIdx.x] = r_d[idx];
    __syncthreads();
  for (int j = 0; j < threads; j++)
  { if(gtid!=j + tile * threads)
    {
    ui = bodyBodyPotential(l2 ,l ,myPosition, shPosition[j], ui);
    }
  }
    __syncthreads();
  }
  // Save the result in global memory
  u_d[gtid] = ui;
}

//===========================================================================
// device function: sum elements of vector a (reduction)
//===========================================================================

__global__ void sumvec(float *a, float *c, int blocks)
{
__shared__ float cache[threads];
int tid=threadIdx.x+blockIdx.x*blockDim.x;
int cacheIndex=threadIdx.x;
float temp=0.0;
while (tid<blocks)
{
temp +=a[tid];
tid+=blockDim.x*gridDim.x;
}
cache[cacheIndex]=temp;
__syncthreads();
int i=blockDim.x/2;
while (i!=0)
{
if (cacheIndex<i)
cache[cacheIndex]+=cache[cacheIndex+i];
__syncthreads();
i/=2;
}
if (cacheIndex==0)
c[blockIdx.x]=cache[0];
}

//===========================================================================
// device: scale coordinates by a factor sf
//===========================================================================

__global__ void scalcoord(float sf,float3 *v_d)
{
  int i=blockIdx.x*blockDim.x+threadIdx.x;
  v_d[i].x=v_d[i].x*sf;
  v_d[i].y=v_d[i].y*sf;
  v_d[i].z=v_d[i].z*sf;
}

//===========================================================================
// device: define values 0 or 1
//===========================================================================

__global__ void setval(float a, float b, float3 *r_d, float *cr_d)
{
  int i=blockIdx.x*blockDim.x+threadIdx.x;
  cr_d[i]=1.0;
  if ((r_d[i].z>a)&&(r_d[i].z<b))
  cr_d[i]=0.0;
}

//===========================================================================
// cuda function: forces (version b)
//===========================================================================

__global__ void calculate_forcesb
(float3 l2, float3 l, int* mapd, int* celld,
int* npcd, int* lstcd, float3 *r_d, float3 *a_d, float *w_d)
{ int i=blockIdx.x*blockDim.x+threadIdx.x;
  float4 acc={0.0, 0.0, 0.0, 0.0};
  float3 myPosition = r_d[i];
  int offseti=celld[i]*27;
  for (int t=0;t<27;t++)
{ int celj=mapd[offseti+t];
  int offsetj=celj*nmaxcd;
  for (int k=0;k<npcd[celj];k++)
{ int j=lstcd[offsetj+k];
  if (j!=i)
  acc = bodyBodyInteraction(l2 ,l ,myPosition, r_d[j], acc);
}
}
  a_d[i].x = acc.x;
  a_d[i].y = acc.y;
  a_d[i].z = acc.z;
  w_d[i] = acc.w;
}

//===========================================================================
// cuda function: forces
//===========================================================================

__global__ void forces_shrange
(float3 l2, float3 l, int* mapd, int* celld,
int* npcd, int* lstcd, float3 *r_d, float3 *a_d, float *w_d)
{
  int celi=blockIdx.x;
  int offseti=celi*27;
  int tid=threadIdx.x;
  int np_celi=npcd[celi];
  float3 myPosition;
  int myip;
   __shared__ float3 shposjp[tpb]; // nmaxcd must be  defined at beginning

  float4 acc={0.0, 0.0, 0.0, 0.0};
  if(tid<np_celi)
  {
       myip=lstcd[celi*nmaxcd+tid];
       myPosition = r_d[myip];
  }

  for (int t=0;t<27;t++)
  {
        int celj=mapd[offseti+t];
        int offsetj=celj*nmaxcd;
        if(tid<npcd[celj])
        {
                shposjp[tid]=r_d[lstcd[offsetj+tid]];
        }
        __syncthreads();

   /*if(npcd[celi]>32 || npcd[celj]>32)
   {printf("\n Shhhiiiit  \n");}
   */
        for(int j=0;j<npcd[celj];j++)
        {
                if((tid<np_celi) && (celi!=celj|| tid!=j)) //to avoid selfinteraction
                 {
                        acc = bodyBodyInteraction(l2 ,l ,myPosition, shposjp[j], acc);
                 }
        }
        __syncthreads();
   }

   if(tid<np_celi)
  {
        a_d[myip].x = acc.x;
        a_d[myip].y = acc.y;
        a_d[myip].z = acc.z;
        w_d[myip] = acc.w;
  }
}

//===========================================================================
// cuda function: forces, alternate
//===========================================================================

__global__
//__launch_bounds__(newtpb, 2 )
void alt_forces_shrange
(float3 l2, float3 l, int* mapd, int* celld,
int* npcd, int* lstcd, float3 *r_d, float3 *a_d, float *w_d)
{

  // __shared__ float3 shposip[tpb]; // tpb=nmaxcd must be  defined at beginning
  // __shared__ int shin_celli[tpb];
   __shared__ float4 shacc[newtpb];

int celi=blockIdx.x;

   int tid=threadIdx.x;

   int ni_celli=tid/32; //this number is between 0 and 26, the warp id

   int np_celli=npcd[celi]; //number of particles in celli

   int celj=mapd[celi*27+ni_celli]; //index of the local neighboring cell

   int np_cellj=npcd[celj]; //number of particles in cellj

   int myjp=-100;       // some of the threads will do nothing becasue tthere are less than 32 particles

   int s=tid%32; // each thread gets a number between 0 and 31

   //if(s==0 && celi==33)  {printf(" %d %d %d %d %d tid= %d\n", celi,celj,ni_celli,np_celli,np_cellj,tid);}

float3 myPosjp;

   if(s<np_cellj)
   {
      myjp=lstcd[celj*nmaxc+s];
      myPosjp=r_d[myjp];
          if (isnan(myPosjp.x)) {printf("nan20: %d,%d,%d\n", celj,nmaxc,s); my_assert(0);}
          if (isnan(myPosjp.y)) {printf("nan21: %d,%d\n", threadIdx.x,blockIdx.x); my_assert(0);}
          if (isnan(myPosjp.z)) {printf("nan22: %d,%d\n", threadIdx.x,blockIdx.x); my_assert(0);}
   }
      __syncthreads();

// we load the particles of the celi on the shared memory in each cell at all time

   /*
   if(npcd[celi]>32 || npcd[celj]>32)
   {printf("\n Shhhiiiit  \n");}
   */

   /*

   if(tid<np_celli)
   {
      int ip=lstcd[celi*nmaxcd+tid];
      shin_celli[tid]=ip;
      shposip[tid]=r_d[ip];
   }
   */

   __syncthreads(); //at this point all data should be on in the block variables

for(int iii=0;iii<np_celli;iii++)
   {
        int myip=lstcd[celi*nmaxc+iii]; //        int myip=shin_celli[iii];
        float3 myPosip=r_d[myip];         //float3 myPosip=shposip[iii];
        float4 acc={0.0,0.0,0.0,0.0};
        shacc[tid]=acc;
        __syncthreads();

        if( myjp>=0 && myjp!=myip && s<np_cellj) //to avoid selfinteraction
        {
          if (isnan(myPosip.x)) {printf("nan8: %d,%d\n", threadIdx.x,blockIdx.x); my_assert(0);}
          if (isnan(myPosip.y)) {printf("nan9: %d,%d\n", threadIdx.x,blockIdx.x); my_assert(0);}
          if (isnan(myPosip.z)) {printf("nan10: %d,%d\n", threadIdx.x,blockIdx.x); my_assert(0);}
          if (isnan(myPosjp.x)) {printf("nan11: %d,%d\n", threadIdx.x,blockIdx.x); my_assert(0);}
          if (isnan(myPosjp.y)) {printf("nan12: %d,%d\n", threadIdx.x,blockIdx.x); my_assert(0);}
          if (isnan(myPosjp.z)) {printf("nan13: %d,%d\n", threadIdx.x,blockIdx.x); my_assert(0);}
          if (isnan(l2.x)) {printf("nan14: %d,%d\n", threadIdx.x,blockIdx.x); my_assert(0);}
          if (isnan(l2.y)) {printf("nan15: %d,%d\n", threadIdx.x,blockIdx.x); my_assert(0);}
          if (isnan(l2.z)) {printf("nan16: %d,%d\n", threadIdx.x,blockIdx.x); my_assert(0);}
          if (isnan(l.x)) {printf("nan17: %d,%d\n", threadIdx.x,blockIdx.x); my_assert(0);}
          if (isnan(l.y)) {printf("nan18: %d,%d\n", threadIdx.x,blockIdx.x); my_assert(0);}
          if (isnan(l.z)) {printf("nan19: %d,%d\n", threadIdx.x,blockIdx.x); my_assert(0);}

                acc= newbodyBodyInteraction(l2 ,l ,myPosip, myPosjp);
          if (isnan(acc.x)) {printf("nan4: %d,%d\n", threadIdx.x,blockIdx.x); my_assert(0);}
          if (isnan(acc.y)) {printf("nan5: %d,%d\n", threadIdx.x,blockIdx.x); my_assert(0);}
          if (isnan(acc.z)) {printf("nan6: %d,%d\n", threadIdx.x,blockIdx.x); my_assert(0);}
          if (isnan(acc.w)) {printf("nan7: %d,%d\n", threadIdx.x,blockIdx.x); my_assert(0);}

        }
        shacc[tid]=acc;
        __syncthreads();

// reduction on the shared memory
        if(tid>=512)
        {
                shacc[tid-512].x+=shacc[tid].x;
                shacc[tid-512].y+=shacc[tid].y;
                shacc[tid-512].z+=shacc[tid].z;
                shacc[tid-512].w+=shacc[tid].w;
        }
        __syncthreads();

        for(int l=256;l>=1;l=l/2)
        {
                if(tid<l)
                {
                        shacc[tid].x+=shacc[tid+l].x;
                        shacc[tid].y+=shacc[tid+l].y;
                        shacc[tid].z+=shacc[tid+l].z;
                        shacc[tid].w+=shacc[tid+l].w;
                }
                __syncthreads();
        }
        __syncthreads();

        if(tid==0)
        {
        if (isnan(shacc[0].x)) {printf("nan0: %d,%d\n", myip,blockIdx.x); my_assert(0);}
        if (isnan(shacc[0].y)) {printf("nan1: %d,%d\n", myip,blockIdx.x); my_assert(0);}
        if (isnan(shacc[0].z)) {printf("nan2: %d,%d\n", myip,blockIdx.x); my_assert(0);}
        if (isnan(shacc[0].w)) {printf("nan3: %d,%d\n", myip,blockIdx.x); my_assert(0);}
        a_d[myip].x = shacc[0].x;
        a_d[myip].y = shacc[0].y;
        a_d[myip].z = shacc[0].z;
        w_d[myip] = shacc[0].w;
        }
        __syncthreads();
   }
}

__global__
//__launch_bounds__(newtpb, 2 )
void newalt_forces_shrange
(float3 l2, float3 l, int* mapd, int* celld,
int* npcd, int* lstcd, float3 *r_d, float3 *a_d, float *w_d)
{

  // __shared__ float3 shposip[tpb]; // tpb=nmaxcd must be  defined at beginning
  // __shared__ int shin_celli[tpb];
   __shared__ float4 shacc[newtpb];

int celi=blockIdx.x;

   int tid=threadIdx.x;

   int ni_celli=tid/32; //this number is between 0 and 26, the warp id

   int np_celli=npcd[celi]; //number of particles in celli

   int celj=mapd[celi*27+ni_celli]; //index of the local neighboring cell

   int np_cellj=npcd[celj]; //number of particles in cellj

   int myjp=-100;       // some of the threads will do nothing becasue tthere are less than 32 particles

   int s=tid%32; // each thread gets a number between 0 and 31

   //if(s==0 && celi==33)  {printf(" %d %d %d %d %d tid= %d\n", celi,celj,ni_celli,np_celli,np_cellj,tid);}

float3 myPosjp;

   if(s<np_cellj)
   {
      myjp=lstcd[celj*nmaxc+s];
      myPosjp=r_d[myjp];
   }
      __syncthreads();

// we load the particles of the celi on the shared memory in each cell at all time

   /*
   if(npcd[celi]>32 || npcd[celj]>32)
   {printf("\n Shhhiiiit  \n");}

   */

   /*

if(tid<np_celli)
   {
      int ip=lstcd[celi*nmaxcd+tid];

      shin_celli[tid]=ip;
      shposip[tid]=r_d[ip];

   }
   */

   __syncthreads(); //at this point all data should be on in the block variables

for(int iii=0;iii<np_celli;iii++)
   {
        int myip=lstcd[celi*nmaxc+iii]; //        int myip=shin_celli[iii];
        float3 myPosip=r_d[myip];         //float3 myPosip=shposip[iii];
        float4 acc={0.0,0.0,0.0,0.0};
        shacc[tid]=acc;
        __syncthreads();

        if( myjp>=0 && myjp!=myip && s<np_cellj) //to avoid selfinteraction
        {
                acc= newbodyBodyInteraction(l2 ,l ,myPosip, myPosjp);

        }
        shacc[tid]=acc;
        __syncthreads();

// reduction on the shared memory
        if(tid>=512)
        {
                shacc[tid-512].x+=shacc[tid].x;
                shacc[tid-512].y+=shacc[tid].y;
                shacc[tid-512].z+=shacc[tid].z;
                shacc[tid-512].w+=shacc[tid].w;
        }
        __syncthreads();

        for(int l=256;l>=1;l=l/2)
        {
                if(tid<l)
                {
                        shacc[tid].x+=shacc[tid+l].x;
                        shacc[tid].y+=shacc[tid+l].y;
                        shacc[tid].z+=shacc[tid+l].z;
                        shacc[tid].w+=shacc[tid+l].w;
                }
                __syncthreads();
        }
        __syncthreads();

        if(tid==0)
        {
        a_d[myip].x = shacc[0].x;
        a_d[myip].y = shacc[0].y;
        a_d[myip].z = shacc[0].z;
        w_d[myip] = shacc[0].w;
        }
        __syncthreads();
   }
}

//===========================================================================
// cuda: cell index of particles
//===========================================================================

__global__ void celli(float3 c,float3 l2,float3 *r_d,int *celld)
{
  int i=blockIdx.x*blockDim.x+threadIdx.x;
  int mxi,myi,mzi;
  mxi=(int)((r_d[i].x+l2.x)/c.x);
  myi=(int)((r_d[i].y+l2.y)/c.y);
  mzi=(int)((r_d[i].z+l2.z)/c.z);
  if (mxi>=mxd) mxi-=mxd;
  if (myi>=myd) myi-=myd;
  if (mzi>=mzd) mzi-=mzd;
  mxi=(mxi+mxd)%mxd;
  myi=(myi+myd)%myd;
  mzi=(mzi+mzd)%mzd;

  celld[i]=mxi+myi*mxd+mzi*mxd*myd;
}

//===========================================================================
// cuda: list of number and indexes of particles belonging to a given cell
//===========================================================================

__global__ void hist(int *npcd, int *lstcd,int *celld)
{ int j,temp;
  int i=blockIdx.x*blockDim.x+threadIdx.x;
  j=celld[i];
  temp = atomicAdd(&npcd[j],1);
  lstcd[j*nmaxcd+temp]=i;
}

//===========================================================================
// kernel function: map cell and neighbor cell indexes
//===========================================================================

void c_initlcl(int *map)
{ int i,j,k,u,v,w,s;
  for (i=0;i<mx;i++)
{ for (j=0;j<my;j++)
{ for (k=0;k<mz;k++)
{ int imap=((i+mx)%mx+((j+my)%my)*mx+((k+mz)%mz)*mx*my)*27;
  s=0;
  for (u=-1;u<=1;u++)
{ for (v=-1;v<=1;v++)
{ for (w=-1;w<=1;w++)
{ map[imap+s]=((i+u+mx)%mx+((j+v+my)%my)*mx+((k+w+mz)%mz)*mx*my);
  s=s+1;
} } } } } } }

__global__ void nan_kernel(float3 *d, int len, int i){

  int idx = threadIdx.x+blockDim.x*blockIdx.x;
  if (idx < len){
          if (isnan(d[idx].x)) {printf("nan30: %d\n", i); my_assert(0);}
          if (isnan(d[idx].y)) {printf("nan31: %d\n", i); my_assert(0);}
          if (isnan(d[idx].z)) {printf("nan32: %d\n", i); my_assert(0);}
  }
}
//===========================================================================
// MAIN PROGRAM
//===========================================================================

int main() {
  cudaSetDevice(0);
  int seed = 0;
  float3 l = {lx, ly, lz};
  float3 r[n], v[n];
  int map[27*ncel];
  float3 *r_d, *v_d, *a_d;
  float *u_d, *k_d, *c1, *c2, *w_d, *cr_d;
  int *mapd, *celld, *npcd, *lstcd;
  float ekin, epot, temp, temp0, w, sfv, pres, vol, sfr, pres0;
  temp0=0.50;
  pres0=0.10;
  float3 l2 = {l.x/2.0, l.y/2.0, l.z/2.0};
  float3 c = {l.x/(float)(mx), l.y/(float)(my), l.z/(float)(mz)};

  cudaMalloc((void**)&r_d, n*sizeof(float3));
  cudaMalloc((void**)&v_d, n*sizeof(float3));
  cudaMalloc((void**)&a_d, n*sizeof(float3));
  cudaMalloc((void**)&u_d, n*sizeof(float));
  cudaMalloc((void**)&k_d, n*sizeof(float));
  cudaMalloc((void**)&w_d, n*sizeof(float));
  cudaMalloc((void**)&c1, n*sizeof(float));
  cudaMalloc((void**)&c2, n*sizeof(float));
  cudaMalloc((void**)&cr_d, n*sizeof(float));
  cudaMalloc((void**)&mapd,27*ncel*sizeof(int));
  cudaMalloc((void**)&lstcd,ncel*nmaxc*sizeof(int));
  cudaMalloc((void**)&npcd,ncel*sizeof(int));
  cudaMalloc((void**)&celld,n*sizeof(int));
  cudaMemcpyToSymbol(rcd,&rc,sizeof(float));
  cudaMemcpyToSymbol(rc2d,&rc2,sizeof(float));
  cudaMemcpyToSymbol(csrd,&csr,sizeof(float));
  cudaMemcpyToSymbol(qpd,&qp,sizeof(float));
  cudaMemcpyToSymbol(dtd,&dt,sizeof(float));
  cudaMemcpyToSymbol(cvd,&cv,sizeof(float));
  cudaMemcpyToSymbol(cfd,&cf,sizeof(float));
  cudaMemcpyToSymbol(mxd,&mx,sizeof(int));
  cudaMemcpyToSymbol(myd,&my,sizeof(int));
  cudaMemcpyToSymbol(mzd,&mz,sizeof(int));
  cudaMemcpyToSymbol(nmaxcd,&nmaxc,sizeof(int));
  cudaMemcpyToSymbol(nceld,&ncel,sizeof(int));

  c_initlcl(map);
  c_fcc(lc,l,r);
  c_vel(seed,temp0,v);

  cudaMemcpy(r_d, r, n*sizeof(float3), cudaMemcpyHostToDevice);
  nan_kernel<<<(n/1024)+1, 1024>>>(r_d, n, 1);
  cudaMemcpy(v_d, v, n*sizeof(float3), cudaMemcpyHostToDevice);
  cudaMemcpy(mapd,map,27*ncel*sizeof(int),cudaMemcpyHostToDevice);

  cudaMemsetAsync(npcd,0,ncel*sizeof(int));
  celli<<<blocks,threads>>>(c,l2,r_d,celld);
  nan_kernel<<<(n/1024)+1, 1024>>>(r_d, n, 2);
  hist<<<blocks,threads>>>(npcd,lstcd,celld);
  calculate_forcesb<<<blocks,threads>>>(l2,l,mapd,celld,npcd,lstcd,r_d,a_d,w_d);
  nan_kernel<<<(n/1024)+1, 1024>>>(r_d, n, 3);

  sumvec<<<blocks,threads>>>(w_d,c1,n);
  sumvec<<<blocks,threads>>>(c1,c2,blocks);
  cudaMemcpy(&w,c2,sizeof(float),cudaMemcpyDeviceToHost);
  ekini<<<blocks,threads>>>(v_d, k_d);
  sumvec<<<blocks,threads>>>(k_d,c1,n);
  sumvec<<<blocks,threads>>>(c1,c2,blocks);
  cudaMemcpy(&ekin,c2,sizeof(float),cudaMemcpyDeviceToHost);
  temp=ekin/(3.0*float(n));
  vol=l.x*l.y*l.z;
  pres=(float(n)*temp+w/6.0)/vol;
  printf("%f\t%f\t%f\n",l.x,l.y,l.z);
  printf("%f\t%f\n",ekin,temp);
  printf("%f\t%f\n",vol,pres);
  setval<<<blocks,threads>>>(l.z+1.0,-l.z-1.0,r_d,cr_d);
  nan_kernel<<<(n/1024)+1, 1024>>>(r_d, n, 4);

//========================================================
// Main-Loop 1:
//========================================================

  for (int i=0;i<10;i++)
{
// propagation of coordinates and velocities
  sfr=1.0+qp*dt*(pres-pres0);
  l.x=sfr*l.x;
  l.y=sfr*l.y;
  l.z=sfr*l.z;
  l2.x = l.x/2.0;
  l2.y = l.y/2.0;
  l2.z = l.z/2.0;
  c.x=l.x/(float)(mx);
  c.y=l.y/(float)(my);
  c.z=l.z/(float)(mz);
  nan_kernel<<<(n/1024)+1, 1024>>>(r_d, n, 5);
  scalcoord<<<blocks,threads>>>(sfr,r_d);
  nan_kernel<<<(n/1024)+1, 1024>>>(r_d, n, 6);
  intstep1<<<blocks,threads>>>(l,l2,cr_d,r_d,v_d,a_d);
  nan_kernel<<<(n/1024)+1, 1024>>>(r_d, n, 7);
  cudaMemsetAsync(npcd,0,ncel*sizeof(int));
  celli<<<blocks,threads>>>(c,l2,r_d,celld);
  nan_kernel<<<(n/1024)+1, 1024>>>(r_d, n, 8);
  hist<<<blocks,threads>>>(npcd,lstcd,celld);

  //calculate_forcesb<<<blocks,threads>>>(l2,l,mapd,celld,npcd,lstcd,r_d,a_d,w_d);

  //forces_shrange<<<ncel,tpb>>>(l2,l,mapd,celld,npcd,lstcd,r_d,a_d,w_d);

  alt_forces_shrange<<<ncel,newtpb>>>(l2,l,mapd,celld,npcd,lstcd,r_d,a_d,w_d);
  nan_kernel<<<(n/1024)+1, 1024>>>(r_d, n, 9);

  intstep2<<<blocks,threads>>>(cr_d,v_d,a_d);
  sumvec<<<blocks,threads>>>(w_d,c1,n);
  sumvec<<<blocks,threads>>>(c1,c2,blocks);
  cudaMemcpy(&w,c2,sizeof(float),cudaMemcpyDeviceToHost);
  ekini<<<blocks,threads>>>(v_d, k_d);
  sumvec<<<blocks,threads>>>(k_d,c1,n);
  sumvec<<<blocks,threads>>>(c1,c2,blocks);
  cudaMemcpy(&ekin,c2,sizeof(float),cudaMemcpyDeviceToHost);
  temp=ekin/(3.0*float(n));
  vol=l.x*l.y*l.z;
  pres=(float(n)*temp+w/6.0)/vol;
  printf("%i\t%f\t%f\t%f\t%f\n",i,n/vol,temp,pres,sfr);
  if (i%300==0)
{
  scalcoord<<<blocks,threads>>>(sfv,v_d);
}

  if (i%1000==0)
{ cudaMemcpy(r,r_d,n*sizeof(float3),cudaMemcpyDeviceToHost);
  cudaMemcpy(v,v_d,n*sizeof(float3),cudaMemcpyDeviceToHost);
  c_confout(seed,i,l,r);
  c_velout(seed,i,v);
}

}
exit(0);

  cudaFree(r_d);
  cudaFree(v_d);
  cudaFree(a_d);
}

The nan58 printout occurs at line 361.