Monte Carlo simulations on GPU

I am trying to get a speedup for my Monte Carlo molecular simulations on the GPU. The strategy is as follows: I run the energy calculations of my simulation on the GPU.

  1. I load all molecule positions onto the GPU.
  2. I transfer all the positions from global memory to shared memory.
  3. Each thread then calculates the interaction energy of one pair of molecules, i.e. the interaction of the target molecule with one other molecule.
  4. It also counts the neighboring molecules around the target molecule.

If I have 2560 molecules and run with 256 threads per block, the time taken by the GPU is 0.226 ms. If I time the same calculation on the CPU, it takes 0.063 ms, i.e. the CPU is faster. I think I am going wrong somewhere. Can you please suggest something?

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <assert.h>
#include <cuda.h>
#include "kernel.cu"
#include <cutil.h>

void host(float *Xo_h,float *Yo_h,float *Zo_h,float *ee_h,float xii_h,float yii_h,float zii_h,int mol,int *count_h,int *nbor_h);
int main()
{
float *Xo_h, *Yo_h, *Zo_h, *ee_h ;
int *count_h,*nbor_h;
int mol;

float xii_h, yii_h, zii_h ;

host(Xo_h, Yo_h, Zo_h, ee_h, xii_h, yii_h, zii_h, mol, count_h, nbor_h);
}

void host(float *Xo_h,float *Yo_h,float *Zo_h,float *ee_h,float xii_h,float yii_h,float zii_h,int mol,int *count_h,int *nbor_h)
{
int nBytes, i, N ,iBytes ;
float *Xo_d, *Yo_d, *Zo_d, *ee_d ;
int *count_d,*nbor_d;
float xii_d,yii_d,zii_d;

mol = 5;
N = 2560 ;

nBytes = N*sizeof(float);
iBytes = N*sizeof(int);

Xo_h = (float*)malloc(nBytes) ;
Yo_h = (float*)malloc(nBytes) ;
Zo_h = (float*)malloc(nBytes) ;
ee_h = (float*)malloc(nBytes) ;
count_h =(int*)malloc(iBytes) ;
nbor_h =(int*)malloc(iBytes) ;
//xii_h = (float*)malloc(sizeof(float)) ;
//yii_h = (float*)malloc(sizeof(float)) ;
// zii_h = (float*)malloc(sizeof(float)) ;

for ( i=0 ; i< N ; i++ )
{ Xo_h[i] = 0.3* (float)i ;
Yo_h[i] = 0.4*(float)i ;
Zo_h[i] = 0.5*(float)i ;
ee_h[i] = 0.0;
count_h[i] = 0;
nbor_h[i] = 0;
}
xii_d = Xo_h[5];
yii_d = Yo_h[5];
zii_d = Zo_h[5];
FILE *fp;
fp = fopen("result1.dat", "w");
for (i = 0; i < N; i++)
{
    fprintf(fp, "ee equals %f \n", ee_h[i]);
    fprintf(fp, "Xo_h equals %f \n", Xo_h[i]);
    fprintf(fp, "count_h equals %i \n", count_h[i]);
}
fclose(fp);

cudaMalloc((void**)&Xo_d ,nBytes);
cudaMalloc((void**)&Yo_d ,nBytes);
cudaMalloc((void**)&Zo_d ,nBytes);
cudaMalloc((void**)&ee_d ,nBytes);
cudaMalloc((void**)&count_d ,iBytes);
cudaMalloc((void**)&nbor_d,iBytes);
//cudaMalloc((void**)&xii_d ,sizeof(float));
//cudaMalloc((void**)&yii_d ,sizeof(float));
//cudaMalloc((void**)&zii_d ,sizeof(float));

cudaMemcpy(Xo_d,Xo_h,nBytes,cudaMemcpyHostToDevice);
cudaMemcpy(Yo_d,Yo_h,nBytes,cudaMemcpyHostToDevice);
cudaMemcpy(Zo_d,Zo_h,nBytes,cudaMemcpyHostToDevice);
cudaMemcpy(ee_d,ee_h,nBytes,cudaMemcpyHostToDevice);
cudaMemcpy(count_d,count_h,iBytes,cudaMemcpyHostToDevice);
cudaMemcpy(nbor_d,nbor_h,iBytes,cudaMemcpyHostToDevice);
//cudaMemcpy(xii_d,xii_h,sizeof(float),cudaMemcpyHostToDevice);
//cudaMemcpy(yii_d,yii_h,sizeof(float),cudaMemcpyHostToDevice);
//cudaMemcpy(zii_d,zii_h,sizeof(float),cudaMemcpyHostToDevice);

// create and start timer
unsigned int timer = 0;
cutCreateTimer(&timer);
cutStartTimer(timer);

/* Define grid and block size */

// Launch kernel

dim3 dimGrid(10,1,1);
dim3 dimBlock(256,1,1) ;
ljondevice <<< dimGrid ,dimBlock >>> (Xo_d ,Yo_d ,Zo_d , ee_d ,xii_d,yii_d,zii_d,mol,count_d,nbor_d);

CUT_CHECK_ERROR("Kernel execution failed");

cudaMemcpy( ee_h,ee_d ,nBytes,cudaMemcpyDeviceToHost);
cudaMemcpy( Xo_h,Xo_d ,nBytes,cudaMemcpyDeviceToHost);
cudaMemcpy( count_h,count_d ,iBytes,cudaMemcpyDeviceToHost);
cudaMemcpy( nbor_h,nbor_d ,iBytes,cudaMemcpyDeviceToHost);
// stop and destroy timer
cutStopTimer(timer);
printf("Processing time: %f (ms) \n", cutGetTimerValue(timer));
cutDeleteTimer(timer);

FILE *gp;
gp = fopen("result2.dat", "w");
for (i = 0; i < N; i++)
{
    fprintf(gp, "ee equals %f \n", ee_h[i]);
    fprintf(gp, "Xo_h equals %f \n", Xo_h[i]);
    fprintf(gp, "count_h equals %i \n", count_h[i]);
    fprintf(gp, "nbor_h equals %i \n", nbor_h[i]);
}
fclose(gp);

free(Xo_h);
free(Yo_h);
free(Zo_h);
free(ee_h);
free(count_h);
free(nbor_h);

cudaFree(Xo_d);
cudaFree(Yo_d);
cudaFree(Zo_d);
cudaFree(ee_d);
cudaFree(count_d);
cudaFree(nbor_d);

}

/* ---------------- kernel.cu ---------------- */

#ifndef KERNEL_H
#define KERNEL_H

#include <stdio.h>

void checkCUDAError(const char* msg);

__global__ void ljondevice(float *Xo_d, float *Yo_d, float *Zo_d, float *ee_d, float xii_h, float yii_h, float zii_h, int mol, int *count_d, int *nbor_d)

{
float sigOO = 3.166f ;
float epsOO = 78.202f ;
float box_x = 36.182f ;
float box_y = 36.182f ;
float box_z = 150.0f ;
float rcutsq_ff = 100.0f ;
float rabvsq = 12.25f ;
// float rovlpsq = 4.0f ;
// float ee_thread,rij ;
//float xii = Xo_d[mol] ;
//float yii = Yo_d[mol] ;
//float zii = Zo_d[mol] ;

__shared__ float Xos[256];
__shared__ float Yos[256];
__shared__ float Zos[256];
__shared__ int count_s[256];
__shared__ int nbor_s[256];
__shared__ float rij[256];
__shared__ float ee_thread[256];

int idx = blockIdx.x*blockDim.x + threadIdx.x ;

// int idy = blockIdx.y*blockDim.y + threadIdx.y;
// int idz = blockIdx.z*blockDim.z + threadIdx.z;

int tx = threadIdx.x ;
// int ty = threadIdx.y ;
// int tz = threadIdx.z ;

Xos[tx] = Xo_d[idx] ;
Yos[tx] = Yo_d[idx] ;
Zos[tx] = Zo_d[idx] ;
count_s[tx] = count_d[idx] ;
nbor_s[tx] = nbor_d[idx] ;

//printf("Xos is %f \n", Xos[tx]);

//printf("thread x is %i \n", tx);
// printf("thread y is %i \n", ty);
// printf("thread z is %i \n", tz);
__syncthreads() ;

if ((Xos[tx] - xii_h) < 0.0) {
    Xos[tx] = -1.0*(Xos[tx] - xii_h);
} else {
    Xos[tx] = (Xos[tx] - xii_h);
}

if ((Yos[tx] - yii_h) < 0.0) {
    Yos[tx] = -1.0*(Yos[tx] - yii_h);
} else {
    Yos[tx] = (Yos[tx] - yii_h);
}

if ((Zos[tx] - zii_h) < 0.0) {
    Zos[tx] = -1.0*(Zos[tx] - zii_h);
} else {
    Zos[tx] = (Zos[tx] - zii_h);
}

if( Xos[tx] > box_x - Xos[tx])
{
Xos[tx] = Xos[tx] - box_x ;
}
if( Yos[tx] > box_y - Yos[tx])
{
Yos[tx] = Yos[tx] - box_y ;
}
if( Zos[tx] > box_z - Zos[tx])
{
Zos[tx] = Zos[tx] - box_z ;
}

rij[tx] = Xos[tx]*Xos[tx] + Yos[tx]*Yos[tx] + Zos[tx]*Zos[tx];

// printf("distance %f \n", rij[tx]);
// printf("Xos is %f \n", Xos[tx]);

if(rij[tx] < rabvsq ){
count_s[tx] = 1 ;
nbor_s[tx] = 1+idx ;
}
else{
count_s[tx] = 0 ;
nbor_s[tx] = 0 ;
}

if( rij[tx] < rcutsq_ff && rij[tx] == 0.0f )
{
ee_thread[tx] = 0.0f;

} else if(rij[tx] < rcutsq_ff)
{
rij[tx] = (sigOO*sigOO)/(rij[tx]) ;
ee_thread[tx] = 4.0 * epsOO *(rij[tx]*rij[tx]*rij[tx]*rij[tx]*rij[tx]*rij[tx] - rij[tx]*rij[tx]*rij[tx]);

}
else{
      ee_thread[tx] = 0.0f;

}
__syncthreads() ;

//printf("rabvsq %f \n", rabvsq);
// printf("count is %i \n", count_s[tx]);
// printf("nbor is %i \n", nbor_s[tx]);
// printf("ee %f \n", ee_thread[tx]);
Xo_d[idx] = Xos[tx];
count_d[idx] = count_s[tx] ;
ee_d[idx] = ee_thread[tx] ;
nbor_d[idx] = nbor_s[tx];
}
/*
void checkCUDAError(const char *msg)
{
    cudaError_t err = cudaGetLastError();
    if (cudaSuccess != err)
    {
        fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}
*/
#endif

I will take the Tesla C1060 as an example.

(1) The resource usage of your kernel is

lmem = 0

smem = 7248

reg  = 13

Occupancy is 50%, i.e. 512 threads per SM.

This number (> 192 threads) is good; it can hide pipeline latency.

(2) From your code, the global reads/writes are coalesced and there are no bank conflicts in shared memory.

However, you only activate 10 SMs (the Tesla C1060 has 30 SMs).

This means that your problem under-utilizes the GPU.

You can try a larger dataset, for example N = 256 * 200, so that you use all the resources of the GPU (a launch-sizing sketch follows).
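For illustration only (this is my sketch, not code from the post above): deriving the grid size from N instead of hard-coding 10 blocks. It assumes a hypothetical bounds check is added inside the kernel so the spare threads in the last block do nothing.

// Sketch: size the grid from N so all 30 SMs of a C1060 can get work.
// Assumes the kernel adds something like "if (idx >= N) return;" (not in the
// posted kernel), which would mean passing N as an extra kernel argument.
int threadsPerBlock = 256;
int blocksPerGrid   = (N + threadsPerBlock - 1) / threadsPerBlock;   // ceiling division

dim3 dimBlock(threadsPerBlock, 1, 1);
dim3 dimGrid(blocksPerGrid, 1, 1);   // e.g. N = 256*200 gives 200 blocks
ljondevice<<<dimGrid, dimBlock>>>(Xo_d, Yo_d, Zo_d, ee_d, xii_d, yii_d, zii_d, mol, count_d, nbor_d);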

(3) In fact, you don't need shared memory. I rewrote your kernel to use registers instead:

__global__ void ljondevice(float *Xo_d, float *Yo_d, float *Zo_d, float *ee_d,
                           float xii_h, float yii_h, float zii_h,
                           int mol, int *count_d, int *nbor_d)
{
	float sigOO = 3.166f;
	float epsOO = 78.202f;
	float box_x = 36.182f;
	float box_y = 36.182f;
	float box_z = 150.0f;
	float rcutsq_ff = 100.0f;
	float rabvsq = 12.25f;

	float Xos_tx;
	float Yos_tx;
	float Zos_tx;
	int   count_s_tx;
	int   nbor_s_tx;
	float rij_tx;
	float ee_thread_tx;

	int tx  = threadIdx.x;
	int idx = blockIdx.x*blockDim.x + threadIdx.x;

	Xos_tx = Xo_d[idx];
	Yos_tx = Yo_d[idx];
	Zos_tx = Zo_d[idx];
	count_s_tx = count_d[idx];
	nbor_s_tx  = nbor_d[idx];

	// no __syncthreads() needed: each thread only touches its own registers

	// absolute displacement from the target molecule
	Xos_tx -= xii_h;
	if ( Xos_tx < 0.0f ){ Xos_tx *= -1.0f; }

	Yos_tx -= yii_h;
	if ( Yos_tx < 0.0f ){ Yos_tx *= -1.0f; }

	Zos_tx -= zii_h;
	if ( Zos_tx < 0.0f ){ Zos_tx *= -1.0f; }

	// minimum-image convention
	if( Xos_tx > box_x - Xos_tx ){ Xos_tx -= box_x; }
	if( Yos_tx > box_y - Yos_tx ){ Yos_tx -= box_y; }
	if( Zos_tx > box_z - Zos_tx ){ Zos_tx -= box_z; }

	rij_tx = Xos_tx*Xos_tx + Yos_tx*Yos_tx + Zos_tx*Zos_tx;

	// neighbour flag and index
	if( rij_tx < rabvsq ){
		count_s_tx = 1;
		nbor_s_tx  = 1 + idx;
	}else{
		count_s_tx = 0;
		nbor_s_tx  = 0;
	}

	// Lennard-Jones energy (zero for the self-interaction and beyond the cutoff)
	if( rij_tx < rcutsq_ff && rij_tx == 0.0f ){
		ee_thread_tx = 0.0f;
	}else if( rij_tx < rcutsq_ff ){
		rij_tx = (sigOO*sigOO)/(rij_tx);
		float rij_cube = rij_tx * rij_tx * rij_tx;
		ee_thread_tx = 4.0f * epsOO * rij_cube * ( rij_cube - 1.0f );
	}else{
		ee_thread_tx = 0.0f;
	}

	Xo_d[idx]    = Xos_tx;
	count_d[idx] = count_s_tx;
	ee_d[idx]    = ee_thread_tx;
	nbor_d[idx]  = nbor_s_tx;
}

The above kernel uses

lmem = 0

smem = 80

reg  = 12

Occupancy is 100%.

However, the two kernels show essentially no difference in performance:

shared memory based: 0.098591 (ms)

register based: 0.096469 (ms)

So my suggestion is: try a larger dataset to take advantage of

(1) the roughly 100 GB/s memory bandwidth of the Tesla C1060 and

(2) the roughly 900 Gflop/s of the Tesla C1060.

Your problem size, N = 2560, is very small; I think all of your data fits into the CPU's L2 cache (4 MB or 6 MB for a quad-core),

which is why the CPU performs so well.

Thanks a ton for your detailed reply. I will try out your valuable suggestion.

Thanks again.

Hi, we do our simulations for particle numbers ranging from 2000 to 6000, so I can't increase my dataset to a very large number. Is there still a way to get the benefit of a GPU for my simulations?

Your timing measurement (code below) is not proper,

// create and start timer
unsigned int timer = 0;
cutCreateTimer(&timer);
cutStartTimer(timer);

dim3 dimGrid(10,1,1);
dim3 dimBlock(256,1,1);

ljondevice <<< dimGrid ,dimBlock >>> (Xo_d ,Yo_d ,Zo_d , ee_d ,xii_d,yii_d,zii_d,mol,count_d,nbor_d);

CUT_CHECK_ERROR("Kernel execution failed");

cudaMemcpy( ee_h,ee_d ,nBytes,cudaMemcpyDeviceToHost);
cudaMemcpy( Xo_h,Xo_d ,nBytes,cudaMemcpyDeviceToHost);
cudaMemcpy( count_h,count_d ,iBytes,cudaMemcpyDeviceToHost);
cudaMemcpy( nbor_h,nbor_d ,iBytes,cudaMemcpyDeviceToHost);

// stop and destroy timer
cutStopTimer(timer);

since you measure the kernel time and the data transfer together.

Your kernel is memory-bound (the ratio of floating-point operations to memory reads/writes is low), and the Tesla C1060 has an effective bandwidth of about 80 GB/s (the theoretical bandwidth is ~100 GB/s). However, PCI Express Gen2 delivers only 2~3 GB/s for non-pinned memory transfers, so your timing is dominated by the PCIe transfer.
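(Side note, not part of the reply above: if the host-device copies themselves matter, pinned host memory usually raises the PCIe transfer rate well above the non-pinned 2~3 GB/s. A minimal sketch of what that could look like in the host code, swapping malloc/free for cudaMallocHost/cudaFreeHost:)

// Sketch (assumption, not from the original post): allocate the host arrays as
// pinned (page-locked) memory so cudaMemcpy over PCIe runs closer to peak.
float *Xo_h;
cudaMallocHost((void**)&Xo_h, nBytes);            // instead of malloc(nBytes)
/* ... fill Xo_h and copy with cudaMemcpy exactly as before ... */
cudaFreeHost(Xo_h);                               // instead of free(Xo_h)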

I suggest that you first measure the performance of the kernel on its own:

dim3 dimGrid(10,1,1);
dim3 dimBlock(256,1,1);

int maxite = 1000;

// create and start timer
unsigned int timer = 0;
cutCreateTimer(&timer);
cutStartTimer(timer);

// Launch kernel
for(int i = 0; i < maxite; i++){
	ljondevice <<< dimGrid ,dimBlock >>> (Xo_d ,Yo_d ,Zo_d , ee_d ,xii_d,yii_d,zii_d,mol,count_d,nbor_d);
}
CUT_CHECK_ERROR("Kernel execution failed");
cudaThreadSynchronize();

// stop and destroy timer
cutStopTimer(timer);
printf("Processing time: %f (ms) \n", cutGetTimerValue(timer)/((double)maxite));
cutDeleteTimer(timer);
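(Another side note from me, not in the original reply: the same per-launch timing can be done with CUDA events instead of the cutil timer, which removes the dependency on cutil. A minimal sketch:)

// Sketch: average kernel time over maxite launches using CUDA events.
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaEventRecord(start, 0);
for (int i = 0; i < maxite; i++) {
    ljondevice<<<dimGrid, dimBlock>>>(Xo_d, Yo_d, Zo_d, ee_d, xii_d, yii_d, zii_d, mol, count_d, nbor_d);
}
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);                        // wait until all launches have finished

float elapsed_ms = 0.0f;
cudaEventElapsedTime(&elapsed_ms, start, stop);    // elapsed time in milliseconds
printf("Processing time: %f (ms)\n", elapsed_ms / (float)maxite);

cudaEventDestroy(start);
cudaEventDestroy(stop);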

Second, look at the global reads and writes in your kernel:

// read
Xos[tx] = Xo_d[idx];        // float *Xo_d
Yos[tx] = Yo_d[idx];        // float *Yo_d
Zos[tx] = Zo_d[idx];        // float *Zo_d
count_s[tx] = count_d[idx]; // int *count_d
nbor_s[tx] = nbor_d[idx];   // int *nbor_d

// write
Xo_d[idx] = Xos[tx];        // float *Xo_d
count_d[idx] = count_s[tx]; // int *count_d
ee_d[idx] = ee_thread[tx];  // float *ee_d
nbor_d[idx] = nbor_s[tx];   // int *nbor_d

In total, the kernel moves 5N floats and 4N ints (i.e. 36*N bytes), and we measure bandwidth = 36*N bytes / (kernel time).
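For clarity, here is that bandwidth calculation spelled out on the host (my own illustration; kernel_time_ms is a hypothetical variable holding the averaged kernel time measured above):

// 5N floats + 4N ints = 9N four-byte words = 36*N bytes moved per kernel launch.
double bytes_moved   = 36.0 * (double)N;
double bandwidth_GBs = (bytes_moved * 1.0e-9) / (kernel_time_ms * 1.0e-3);  // GB per second
printf("effective bandwidth = %f (GB/s)\n", bandwidth_GBs);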

case 1 (shared-memory based): N = 256*10,  grid = 10,  block = 256 : time = 0.009768 (ms), bandwidth = 8.787183 (GB/s)

case 2 (shared-memory based): N = 256*20,  grid = 20,  block = 256 : time = 0.009770 (ms), bandwidth = 17.569387 (GB/s)

case 3 (shared-memory based): N = 256*30,  grid = 30,  block = 256 : time = 0.010152 (ms), bandwidth = 25.362903 (GB/s)

case 4 (shared-memory based): N = 256*300, grid = 300, block = 256 : time = 0.056808 (ms), bandwidth = 45.326916 (GB/s)

case 4 (register based):      N = 256*300, grid = 300, block = 256 : time = 0.030667 (ms), bandwidth = 83.962978 (GB/s)

Cases 3 and 4 fully utilize all the SMs, so their performance is very good; in particular, the register-based kernel in case 4 reaches close to the maximum bandwidth (the kernel is memory-bound).

Thank you for your reply. I will try it and see how the performance goes.