I am trying to get a speed-up for my Monte Carlo molecular simulations on the GPU. The strategy is as follows. I am trying to run the energy calculations of my simulation on the GPU.
- I load all molecule positions on the GPU
- Transfer all the positions from global memory to shared memory
- Each thread then calculates the interaction energy of one pair of molecules, i.e. the interaction of the target molecule with one other molecule
- Also it calculates the neighboring molecules around the target molecule
If I have 2560 molecules, and I am running with 256 threads per block, the time taken by the GPU is 0.226 ms. If I measure the time taken by the CPU to do the same calculation, it is 0.063 ms. That is, the CPU is faster. I think I am going wrong somewhere. Can you please suggest something?
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <assert.h>
#include<cuda.h>
#include<kernel.cu>
#include<cutil.h>
void host(float *Xo_h,float *Yo_h,float *Zo_h,float *ee_h,float xii_h,float yii_h,float zii_h,int mol,int *count_h,int *nbor_h);
int main()
{
    /* All of these are allocated/assigned inside host(); initialize them
     * anyway so we never pass indeterminate values (reading uninitialized
     * locals, even to pass by value, is undefined behavior in C). */
    float *Xo_h = NULL, *Yo_h = NULL, *Zo_h = NULL, *ee_h = NULL;
    int *count_h = NULL, *nbor_h = NULL;
    int mol = 0;                              /* overwritten in host() */
    float xii_h = 0.0f, yii_h = 0.0f, zii_h = 0.0f;
    host(Xo_h, Yo_h, Zo_h, ee_h, xii_h, yii_h, zii_h, mol, count_h, nbor_h);
    return 0;
}
/*
 * Allocates host and device buffers for N molecule positions, initializes
 * them, launches the ljondevice kernel (one thread per molecule), copies the
 * per-pair energies / neighbor flags back, and writes before/after snapshots
 * to result1.dat / result2.dat.
 *
 * Note all pointer parameters are reassigned locally (they are passed by
 * value), so the caller's pointers are untouched; everything allocated here
 * is also freed here.
 *
 * On the CPU-vs-GPU timing question: N = 2560 pair interactions is far too
 * little work to amortize the kernel-launch and host<->device transfer
 * overhead, and the timed region below includes the device-to-host copies.
 * The GPU only wins when each launch does much more arithmetic (e.g. every
 * molecule against every other molecule, not one target vs. all).
 */
void host(float *Xo_h,float *Yo_h,float *Zo_h,float *ee_h,float xii_h,float yii_h,float zii_h,int mol,int *count_h,int *nbor_h)
{
    int nBytes, i, N, iBytes;
    float *Xo_d, *Yo_d, *Zo_d, *ee_d;
    int *count_d, *nbor_d;
    float xii_d, yii_d, zii_d;

    mol = 5;                      /* index of the target molecule */
    N   = 2560;                   /* total number of molecules    */
    /* BUG FIX: original read "Nsizeof(float)" - the '*' was missing. */
    nBytes = N * sizeof(float);
    iBytes = N * sizeof(int);

    Xo_h    = (float*)malloc(nBytes);
    Yo_h    = (float*)malloc(nBytes);
    Zo_h    = (float*)malloc(nBytes);
    ee_h    = (float*)malloc(nBytes);
    count_h = (int*)malloc(iBytes);
    nbor_h  = (int*)malloc(iBytes);

    /* Synthetic positions; float literals avoid double->float conversions. */
    for (i = 0; i < N; i++) {
        Xo_h[i] = 0.3f * (float)i;
        Yo_h[i] = 0.4f * (float)i;
        Zo_h[i] = 0.5f * (float)i;
        ee_h[i] = 0.0f;
        count_h[i] = 0;
        nbor_h[i] = 0;
    }

    /* Coordinates of the target molecule, passed to the kernel by value.
     * BUG FIX: original hard-coded index 5 instead of using mol. */
    xii_d = Xo_h[mol];
    yii_d = Yo_h[mol];
    zii_d = Zo_h[mol];

    /* Snapshot of the input state (straight quotes - the pasted original
     * used typographic quotes, which do not compile). */
    FILE *fp = fopen("result1.dat", "w");
    if (fp != NULL) {
        for (i = 0; i < N; i++) {
            fprintf(fp, "ee equals %f \n", ee_h[i]);
            fprintf(fp, "Xo_h equals %f \n", Xo_h[i]);
            fprintf(fp, "count_h equals %i \n", count_h[i]);
        }
        fclose(fp);               /* BUG FIX: original never closed fp */
    }

    cudaMalloc((void**)&Xo_d,    nBytes);
    cudaMalloc((void**)&Yo_d,    nBytes);
    cudaMalloc((void**)&Zo_d,    nBytes);
    cudaMalloc((void**)&ee_d,    nBytes);
    cudaMalloc((void**)&count_d, iBytes);
    cudaMalloc((void**)&nbor_d,  iBytes);

    cudaMemcpy(Xo_d, Xo_h, nBytes, cudaMemcpyHostToDevice);
    cudaMemcpy(Yo_d, Yo_h, nBytes, cudaMemcpyHostToDevice);
    cudaMemcpy(Zo_d, Zo_h, nBytes, cudaMemcpyHostToDevice);
    cudaMemcpy(ee_d, ee_h, nBytes, cudaMemcpyHostToDevice);
    /* BUG FIX: the int arrays must copy iBytes, not nBytes (the float
     * byte count). They happen to match when sizeof(int)==sizeof(float),
     * but the original was semantically wrong. */
    cudaMemcpy(count_d, count_h, iBytes, cudaMemcpyHostToDevice);
    cudaMemcpy(nbor_d,  nbor_h,  iBytes, cudaMemcpyHostToDevice);

    /* Create and start timer. Note: the timed region below covers the
     * kernel launch AND the blocking device-to-host copies, so it measures
     * transfer time too, not just compute. */
    unsigned int timer = 0;
    cutCreateTimer(&timer);
    cutStartTimer(timer);

    /* One thread per molecule; grid derived from N instead of hard-coded
     * so the launch stays correct if N changes (ceil-div). */
    dim3 dimBlock(256, 1, 1);
    dim3 dimGrid((N + dimBlock.x - 1) / dimBlock.x, 1, 1);
    ljondevice<<<dimGrid, dimBlock>>>(Xo_d, Yo_d, Zo_d, ee_d,
                                      xii_d, yii_d, zii_d,
                                      mol, count_d, nbor_d);
    CUT_CHECK_ERROR("Kernel execution failed");

    /* Blocking copies - these also synchronize with the kernel. */
    cudaMemcpy(ee_h,    ee_d,    nBytes, cudaMemcpyDeviceToHost);
    cudaMemcpy(Xo_h,    Xo_d,    nBytes, cudaMemcpyDeviceToHost);
    cudaMemcpy(count_h, count_d, iBytes, cudaMemcpyDeviceToHost);
    cudaMemcpy(nbor_h,  nbor_d,  iBytes, cudaMemcpyDeviceToHost);

    /* Stop and destroy timer. */
    cutStopTimer(timer);
    printf("Processing time: %f (ms) \n", cutGetTimerValue(timer));
    cutDeleteTimer(timer);

    /* Snapshot of the results. */
    FILE *gp = fopen("result2.dat", "w");
    if (gp != NULL) {
        for (i = 0; i < N; i++) {
            fprintf(gp, "ee equals %f \n", ee_h[i]);
            fprintf(gp, "Xo_h equals %f \n", Xo_h[i]);
            fprintf(gp, "count_h equals %i \n", count_h[i]);
            fprintf(gp, "nbor_h equals %i \n", nbor_h[i]);
        }
        fclose(gp);               /* BUG FIX: original never closed gp */
    }

    free(Xo_h);
    free(Yo_h);
    free(Zo_h);
    free(ee_h);
    free(count_h);
    free(nbor_h);
    cudaFree(Xo_d);
    cudaFree(Yo_d);
    cudaFree(Zo_d);
    cudaFree(ee_d);
    cudaFree(count_d);
    cudaFree(nbor_d);
}
#ifndef KERNEL_H
#define KERNEL_H
#include <stdio.h>
void checkCUDAError(const char* msg);
/*
 * One thread per molecule: thread `idx` computes the minimum-image distance
 * between molecule idx and the target molecule at (xii_h, yii_h, zii_h),
 * flags it as a neighbor if r^2 < rabvsq, and accumulates its Lennard-Jones
 * pair energy into ee_d[idx] if r^2 < rcutsq_ff (zero for the target itself,
 * where r^2 == 0).
 *
 * Launch contract: blockDim.x == 256 (the shared-array size) and
 * gridDim.x * blockDim.x must equal the array length exactly - there is no
 * bounds guard, so a larger grid would read/write out of bounds.
 *
 * BUG FIX vs. the pasted original: the qualifiers lost their underscores
 * ("global" -> __global__, "device shared" -> __shared__); without them
 * this does not compile as CUDA.
 *
 * NOTE(review): the shared arrays give no speed benefit here - each thread
 * reads and writes only its own slot [tx], so plain registers would do the
 * same work with less overhead. Shared memory pays off only when threads
 * reuse each other's data (e.g. tiling an all-pairs loop). This, plus the
 * tiny problem size (2560 pairs per launch), is why the CPU looks faster.
 *
 * `mol` is currently unused (the target coordinates arrive pre-extracted).
 */
__global__ void ljondevice(float *Xo_d, float *Yo_d, float *Zo_d, float *ee_d,
                           float xii_h, float yii_h, float zii_h,
                           int mol, int *count_d, int *nbor_d)
{
    const float sigOO     = 3.166f;   /* LJ sigma, O-O                  */
    const float epsOO     = 78.202f;  /* LJ epsilon (well depth)        */
    const float box_x     = 36.182f;  /* periodic box edge lengths      */
    const float box_y     = 36.182f;
    const float box_z     = 150.0f;
    const float rcutsq_ff = 100.0f;   /* squared energy cutoff          */
    const float rabvsq    = 12.25f;   /* squared neighbor-list radius   */

    __shared__ float Xos[256];
    __shared__ float Yos[256];
    __shared__ float Zos[256];
    __shared__ int   count_s[256];
    __shared__ int   nbor_s[256];
    __shared__ float rij[256];
    __shared__ float ee_thread[256];

    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int tx  = threadIdx.x;

    /* Stage this thread's molecule into shared memory. */
    Xos[tx]     = Xo_d[idx];
    Yos[tx]     = Yo_d[idx];
    Zos[tx]     = Zo_d[idx];
    count_s[tx] = count_d[idx];
    nbor_s[tx]  = nbor_d[idx];
    __syncthreads();

    /* |r_i - r_target| per component. fabsf replaces the original branchy
     * -1.0* form (which also promoted to double via the 1.0 literal). */
    Xos[tx] = fabsf(Xos[tx] - xii_h);
    Yos[tx] = fabsf(Yos[tx] - yii_h);
    Zos[tx] = fabsf(Zos[tx] - zii_h);

    /* Minimum-image convention: fold each component into the box. */
    if (Xos[tx] > box_x - Xos[tx]) Xos[tx] -= box_x;
    if (Yos[tx] > box_y - Yos[tx]) Yos[tx] -= box_y;
    if (Zos[tx] > box_z - Zos[tx]) Zos[tx] -= box_z;

    /* Squared distance to the target molecule. */
    rij[tx] = Xos[tx]*Xos[tx] + Yos[tx]*Yos[tx] + Zos[tx]*Zos[tx];

    /* Neighbor flag: within rabvsq, store (index + 1) so 0 means "none". */
    if (rij[tx] < rabvsq) {
        count_s[tx] = 1;
        nbor_s[tx]  = 1 + idx;
    } else {
        count_s[tx] = 0;
        nbor_s[tx]  = 0;
    }

    /* Lennard-Jones 12-6 energy: 4*eps*((sig^2/r^2)^6 - (sig^2/r^2)^3).
     * r^2 == 0 is the target interacting with itself - energy 0.
     * (The original guarded "rij < rcutsq_ff && rij == 0.0f"; the first
     * clause is implied by the second, so only the equality remains.) */
    if (rij[tx] == 0.0f) {
        ee_thread[tx] = 0.0f;
    } else if (rij[tx] < rcutsq_ff) {
        float s2 = (sigOO * sigOO) / rij[tx];
        ee_thread[tx] = 4.0f * epsOO * (s2*s2*s2*s2*s2*s2 - s2*s2*s2);
    } else {
        ee_thread[tx] = 0.0f;
    }
    __syncthreads();

    /* Write results back to global memory (Xo_d now holds folded |dx|,
     * matching the original kernel's behavior). */
    Xo_d[idx]    = Xos[tx];
    count_d[idx] = count_s[tx];
    ee_d[idx]    = ee_thread[tx];
    nbor_d[idx]  = nbor_s[tx];
}
/* Helper to report the most recent CUDA error (currently unused).
   Note: the closing brace was missing in the original paste, and the
   typographic quotes would not compile - both fixed so this can be
   uncommented directly.
void checkCUDAError(const char *msg)
{
    cudaError_t err = cudaGetLastError();
    if (cudaSuccess != err)
    {
        fprintf(stderr, "Cuda error: %s: %s.\n", msg,
                cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
} */
#endif