Kernel works fine in emulation mode but gives wrong results on the GPU

My kernel has two arrays, nbor_d and count_d, which are assigned values in the kernel (shown in bold). These values are calculated correctly when the code is run in emulation mode and I printf the values from the kernel. When I run the same code on the GPU and print the values from host code, they are assigned wrong values. Please help. You need not go through the entire code; just have a look at the portion of the code that is in bold.
My code is as follows:

#ifndef KERNEL_H
#define KERNEL_H

#include <stdio.h>

// Error-check helper taking a caller-supplied message; only the declaration is
// visible here. NOTE(review): presumably reports the most recent CUDA error
// tagged with `msg` -- confirm against the definition.
void checkCUDAError(const char* msg);

/*
 * ljondevice -- per-element pair energy and neighbour flag against one
 * reference point.
 *
 * Each thread handles the single element idx = blockIdx.x*blockDim.x +
 * threadIdx.x. It takes the absolute per-axis displacement between
 * (Xo_d[idx], Yo_d[idx], Zo_d[idx]) and the reference point
 * (xii_h, yii_h, zii_h), wraps any component larger than half the box
 * length by subtracting the box length, and forms the squared distance.
 *
 * Parameters:
 *   Xo_d, Yo_d, Zo_d  device arrays of coordinates, read at idx.
 *                     Xo_d[idx] is ALSO overwritten (see note at the end).
 *   ee_d              device array, written at idx: 12-6 Lennard-Jones energy
 *                     4*eps*((s^2/r^2)^6 - (s^2/r^2)^3) when the squared
 *                     distance is non-zero and below the cutoff, else 0.
 *   xii_h, yii_h, zii_h  coordinates of the reference point.
 *   mol               unused by the kernel; kept for interface compatibility.
 *   count_d           device array, written at idx: 1 if the squared distance
 *                     is below the neighbour threshold, else 0.
 *   nbor_d            device array, written at idx: idx + 1 for a neighbour,
 *                     else 0.
 *
 * Launch requirements:
 *   - 1D grid of 1D blocks.
 *   - There is no length parameter, so no bounds check is possible: every
 *     array must hold at least gridDim.x * blockDim.x elements.
 *   - No shared memory is used. All intermediates are per-thread locals, so
 *     the block size is not limited to 8 threads as it was with the original
 *     fixed-size shared arrays.
 */
__global__ void ljondevice(float *Xo_d,float *Yo_d,float *Zo_d,float *ee_d,float xii_h,float yii_h,float zii_h, int mol,int *count_d,int *nbor_d )
{
    // Model constants, hard-coded exactly as in the original kernel.
    const float sigOO     = 3.166f;
    const float epsOO     = 78.202f;
    const float box_x     = 36.182f;
    const float box_y     = 36.182f;
    const float box_z     = 150.0f;
    const float rcutsq_ff = 100.0f;   // squared cutoff for the energy term
    const float rabvsq    = 12.25f;   // squared threshold for the neighbour flag

    (void)mol;  // not used; silences unused-parameter warnings

    const int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // Absolute displacement from the reference point along each axis.
    float dx = fabsf(Xo_d[idx] - xii_h);
    float dy = fabsf(Yo_d[idx] - yii_h);
    float dz = fabsf(Zo_d[idx] - zii_h);

    // Wrap components that exceed half the box length (d > box - d).
    // The result may be negative; only its square is used below.
    if (dx > box_x - dx) { dx -= box_x; }
    if (dy > box_y - dy) { dy -= box_y; }
    if (dz > box_z - dz) { dz -= box_z; }

    const float rsq = dx * dx + dy * dy + dz * dz;

    // Neighbour flag and 1-based neighbour index (0 means "not a neighbour").
    const bool is_nbor = (rsq < rabvsq);
    const int  count   = is_nbor ? 1 : 0;
    const int  nbor    = is_nbor ? (1 + idx) : 0;

    // Pair energy. A squared distance of exactly zero (the element coincides
    // with the reference point) and anything at or beyond the cutoff both
    // contribute zero.
    float energy = 0.0f;
    if (rsq < rcutsq_ff && rsq != 0.0f)
    {
        const float sr2  = (sigOO * sigOO) / rsq;        // (sigma/r)^2
        const float sr6  = sr2 * sr2 * sr2;              // (sigma/r)^6
        const float sr12 = sr6 * sr2 * sr2 * sr2;        // (sigma/r)^12
        energy = 4.0f * epsOO * (sr12 - sr6);
    }

    // NOTE(review): the original kernel writes the wrapped x-displacement back
    // into the input coordinate array. That destroys the stored x position for
    // any later launch that reuses Xo_d and looks like leftover debug output.
    // It is preserved here because callers can observe it -- confirm whether
    // it is intended before removing.
    Xo_d[idx]    = dx;
    count_d[idx] = count;
    ee_d[idx]    = energy;
    nbor_d[idx]  = nbor;
}
#endif

Please help me solve this problem. I have edited the post to make it clearer.

How many threads per block do you launch?

I am launching 8 threads, and there is only one block in the grid, to keep things simple.

I solved the problem myself; it was a coding error.