Monte Carlo simulations on GPU

I am trying to get a speedup for my Monte Carlo molecular simulations on the GPU. The strategy is as follows: I run the energy calculations of my simulation on the GPU.

  1. I load all molecule positions onto the GPU.
  2. I transfer all the positions from global memory to shared memory.
  3. Each thread then calculates the interaction energy of one pair of molecules, i.e. the interaction of the target molecule with one other molecule.
  4. It also counts the neighboring molecules around the target molecule.

If I have 2560 molecules and run with 256 threads per block, the time taken by the GPU is 0.226 ms. If I time the same calculation on the CPU, it takes 0.063 ms, i.e. the CPU is faster. I think I am going wrong somewhere. Can you please suggest something?

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <assert.h>
#include <cuda.h>
#include "kernel.cu"
#include <cutil.h>

void host(float *Xo_h,float *Yo_h,float *Zo_h,float *ee_h,float xii_h,float yii_h,float zii_h,int mol,int *count_h,int *nbor_h);
int main()
{
float *Xo_h, *Yo_h, *Zo_h, *ee_h ;
int *count_h,*nbor_h;
int mol;

float xii_h, yii_h, zii_h ;

host(Xo_h, Yo_h, Zo_h, ee_h, xii_h, yii_h, zii_h, mol, count_h, nbor_h);
}

void host(float *Xo_h,float *Yo_h,float *Zo_h,float *ee_h,float xii_h,float yii_h,float zii_h,int mol,int *count_h,int *nbor_h)
{
int nBytes, i, N ,iBytes ;
float *Xo_d, *Yo_d, *Zo_d, *ee_d ;
int *count_d,*nbor_d;
float xii_d,yii_d,zii_d;

mol = 5;
N = 2560 ;

nBytes = N*sizeof(float);
iBytes = N*sizeof(int);

Xo_h = (float*)malloc(nBytes) ;
Yo_h = (float*)malloc(nBytes) ;
Zo_h = (float*)malloc(nBytes) ;
ee_h = (float*)malloc(nBytes) ;
count_h =(int*)malloc(iBytes) ;
nbor_h =(int*)malloc(iBytes) ;
//xii_h = (float*)malloc(sizeof(float)) ;
//yii_h = (float*)malloc(sizeof(float)) ;
// zii_h = (float*)malloc(sizeof(float)) ;

for ( i=0 ; i< N ; i++ )
{ Xo_h[i] = 0.3* (float)i ;
Yo_h[i] = 0.4*(float)i ;
Zo_h[i] = 0.5*(float)i ;
ee_h[i] = 0.0;
count_h[i] = 0;
nbor_h[i] = 0;
}
xii_d = Xo_h[5];
yii_d = Yo_h[5];
zii_d = Zo_h[5];
FILE *fp;
fp = fopen("result1.dat", "w");
for (i = 0; i < N; i++)
{
    fprintf(fp, "ee equals %f \n", ee_h[i]);
    fprintf(fp, "Xo_h equals %f \n", Xo_h[i]);
    fprintf(fp, "count_h equals %i \n", count_h[i]);
}
fclose(fp);

cudaMalloc((void**)&Xo_d ,nBytes);
cudaMalloc((void**)&Yo_d ,nBytes);
cudaMalloc((void**)&Zo_d ,nBytes);
cudaMalloc((void**)&ee_d ,nBytes);
cudaMalloc((void**)&count_d ,iBytes);
cudaMalloc((void**)&nbor_d,iBytes);
//cudaMalloc((void**)&xii_d ,sizeof(float));
//cudaMalloc((void**)&yii_d ,sizeof(float));
//cudaMalloc((void**)&zii_d ,sizeof(float));

cudaMemcpy(Xo_d,Xo_h,nBytes,cudaMemcpyHostToDevice);
cudaMemcpy(Yo_d,Yo_h,nBytes,cudaMemcpyHostToDevice);
cudaMemcpy(Zo_d,Zo_h,nBytes,cudaMemcpyHostToDevice);
cudaMemcpy(ee_d,ee_h,nBytes,cudaMemcpyHostToDevice);
cudaMemcpy(count_d,count_h,iBytes,cudaMemcpyHostToDevice);
cudaMemcpy(nbor_d,nbor_h,iBytes,cudaMemcpyHostToDevice);
//cudaMemcpy(xii_d,xii_h,sizeof(float),cudaMemcpyHostToDevice);
//cudaMemcpy(yii_d,yii_h,sizeof(float),cudaMemcpyHostToDevice);
//cudaMemcpy(zii_d,zii_h,sizeof(float),cudaMemcpyHostToDevice);

// create and start timer
unsigned int timer = 0;
cutCreateTimer(&timer);
cutStartTimer(timer);

/* Define grid and block size */

// Launch kernel

dim3 dimGrid(10,1,1);
dim3 dimBlock(256,1,1) ;
ljondevice <<< dimGrid ,dimBlock >>> (Xo_d ,Yo_d ,Zo_d , ee_d ,xii_d,yii_d,zii_d,mol,count_d,nbor_d);

CUT_CHECK_ERROR("Kernel execution failed");

cudaMemcpy( ee_h,ee_d ,nBytes,cudaMemcpyDeviceToHost);
cudaMemcpy( Xo_h,Xo_d ,nBytes,cudaMemcpyDeviceToHost);
cudaMemcpy( count_h,count_d ,iBytes,cudaMemcpyDeviceToHost);
cudaMemcpy( nbor_h,nbor_d ,iBytes,cudaMemcpyDeviceToHost);
// stop and destroy timer
cutStopTimer(timer);
printf("Processing time: %f (ms) \n", cutGetTimerValue(timer));
cutDeleteTimer(timer);

FILE *gp;
gp = fopen("result2.dat", "w");
for (i = 0; i < N; i++)
{
    fprintf(gp, "ee equals %f \n", ee_h[i]);
    fprintf(gp, "Xo_h equals %f \n", Xo_h[i]);
    fprintf(gp, "count_h equals %i \n", count_h[i]);
    fprintf(gp, "nbor_h equals %i \n", nbor_h[i]);
}
fclose(gp);

free(Xo_h);
free(Yo_h);
free(Zo_h);
free(ee_h);
free(count_h);
free(nbor_h);

cudaFree(Xo_d);
cudaFree(Yo_d);
cudaFree(Zo_d);
cudaFree(ee_d);
cudaFree(count_d);
cudaFree(nbor_d);

}

/* ---------------- kernel.cu ---------------- */

#ifndef KERNEL_H
#define KERNEL_H

#include <stdio.h>

void checkCUDAError(const char* msg);

__global__ void ljondevice(float *Xo_d, float *Yo_d, float *Zo_d, float *ee_d, float xii_h, float yii_h, float zii_h, int mol, int *count_d, int *nbor_d)

{
float sigOO = 3.166f ;
float epsOO = 78.202f ;
float box_x = 36.182f ;
float box_y = 36.182f ;
float box_z = 150.0f ;
float rcutsq_ff = 100.0f ;
float rabvsq = 12.25f ;
// float rovlpsq = 4.0f ;
// float ee_thread,rij ;
//float xii = Xo_d[mol] ;
//float yii = Yo_d[mol] ;
//float zii = Zo_d[mol] ;

__shared__ float Xos[256];
__shared__ float Yos[256];
__shared__ float Zos[256];
__shared__ int count_s[256];
__shared__ int nbor_s[256];
__shared__ float rij[256];
__shared__ float ee_thread[256];

int idx = blockIdx.x*blockDim.x + threadIdx.x ;

// int idy = blockIdx.y*blockDim.y + threadIdx.y;
// int idz = blockIdx.z*blockDim.z + threadIdx.z;

int tx = threadIdx.x ;
// int ty = threadIdx.y ;
// int tz = threadIdx.z ;

Xos[tx] = Xo_d[idx] ;
Yos[tx] = Yo_d[idx] ;
Zos[tx] = Zo_d[idx] ;
count_s[tx] = count_d[idx] ;
nbor_s[tx] = nbor_d[idx] ;

//printf("Xos is %f \n", Xos[tx]);

//printf("thread x is %i \n", tx);
// printf("thread y is %i \n", ty);
// printf("thread z is %i \n", tz);
__syncthreads() ;

if ((Xos[tx] - xii_h) < 0.0) {
    Xos[tx] = -1.0*(Xos[tx] - xii_h);
} else {
    Xos[tx] = (Xos[tx] - xii_h);
}

if ((Yos[tx] - yii_h) < 0.0) {
    Yos[tx] = -1.0*(Yos[tx] - yii_h);
} else {
    Yos[tx] = (Yos[tx] - yii_h);
}

if ((Zos[tx] - zii_h) < 0.0) {
    Zos[tx] = -1.0*(Zos[tx] - zii_h);
} else {
    Zos[tx] = (Zos[tx] - zii_h);
}

if( Xos[tx] > box_x - Xos[tx])
{
Xos[tx] = Xos[tx] - box_x ;
}
if( Yos[tx] > box_y - Yos[tx])
{
Yos[tx] = Yos[tx] - box_y ;
}
if( Zos[tx] > box_z - Zos[tx])
{
Zos[tx] = Zos[tx] - box_z ;
}

rij[tx] = Xos[tx]*Xos[tx] + Yos[tx]*Yos[tx] + Zos[tx]*Zos[tx];

// printf("distance %f \n", rij[tx]);
// printf("Xos is %f \n", Xos[tx]);

if(rij[tx] < rabvsq ){
count_s[tx] = 1 ;
nbor_s[tx] = 1+idx ;
}
else{
count_s[tx] = 0 ;
nbor_s[tx] = 0 ;
}

if( rij[tx] < rcutsq_ff && rij[tx] == 0.0f )
{
ee_thread[tx] = 0.0f;

} else if(rij[tx] < rcutsq_ff)
{
rij[tx] = (sigOO*sigOO)/(rij[tx]) ;
ee_thread[tx] = 4.0 * epsOO *(rij[tx]*rij[tx]*rij[tx]*rij[tx]*rij[tx]*rij[tx] - rij[tx]*rij[tx]*rij[tx]);

}
else{
      ee_thread[tx] = 0.0f;

}
__syncthreads() ;

//printf("rabvsq %f \n", rabvsq);
// printf("count is %i \n", count_s[tx]);
// printf("nbor is %i \n", nbor_s[tx]);
// printf("ee %f \n", ee_thread[tx]);
Xo_d[idx] = Xos[tx];
count_d[idx] = count_s[tx] ;
ee_d[idx] = ee_thread[tx] ;
nbor_d[idx] = nbor_s[tx];
}
/*
void checkCUDAError(const char *msg)
{
    cudaError_t err = cudaGetLastError();
    if (cudaSuccess != err)
    {
        fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}
*/
#endif

I will take the Tesla C1060 as an example.

(1) The resource usage of your kernel is

lmem = 0

smem = 7248

reg  = 13

Occupancy is 50%, i.e. 512 threads per SM.

This number (> 192 threads) is good; it can hide pipeline latency.

(2) From your code, the global reads/writes are coalesced and there are no bank conflicts in shared memory.

However, you only activate 10 SMs (the Tesla C1060 has 30 SMs).

This means that your problem under-utilizes the GPU.

You can try a larger dataset, for example N = 256 * 200, so that you use all the resources of the GPU (a launch-sizing sketch follows).
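For illustration only (this is my sketch, not code from the post above): deriving the grid size from N instead of hard-coding 10 blocks. It assumes a hypothetical bounds check is added inside the kernel so the spare threads in the last block do nothing.

// Sketch: size the grid from N so all 30 SMs of a C1060 can get work.
// Assumes the kernel adds something like "if (idx >= N) return;" (not in the
// posted kernel), which would mean passing N as an extra kernel argument.
int threadsPerBlock = 256;
int blocksPerGrid   = (N + threadsPerBlock - 1) / threadsPerBlock;   // ceiling division

dim3 dimBlock(threadsPerBlock, 1, 1);
dim3 dimGrid(blocksPerGrid, 1, 1);   // e.g. N = 256*200 gives 200 blocks
ljondevice<<<dimGrid, dimBlock>>>(Xo_d, Yo_d, Zo_d, ee_d, xii_d, yii_d, zii_d, mol, count_d, nbor_d);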

(3) In fact, you don't need shared memory. I rewrote your kernel to use registers instead:

__global__ void ljondevice(float *Xo_d, float *Yo_d, float *Zo_d, float *ee_d,
                           float xii_h, float yii_h, float zii_h,
                           int mol, int *count_d, int *nbor_d)
{
	float sigOO = 3.166f;
	float epsOO = 78.202f;
	float box_x = 36.182f;
	float box_y = 36.182f;
	float box_z = 150.0f;
	float rcutsq_ff = 100.0f;
	float rabvsq = 12.25f;

	float Xos_tx;
	float Yos_tx;
	float Zos_tx;
	int   count_s_tx;
	int   nbor_s_tx;
	float rij_tx;
	float ee_thread_tx;

	int tx  = threadIdx.x;
	int idx = blockIdx.x*blockDim.x + threadIdx.x;

	Xos_tx = Xo_d[idx];
	Yos_tx = Yo_d[idx];
	Zos_tx = Zo_d[idx];
	count_s_tx = count_d[idx];
	nbor_s_tx  = nbor_d[idx];

	// no __syncthreads() needed: each thread only touches its own registers

	// absolute displacement from the target molecule
	Xos_tx -= xii_h;
	if ( Xos_tx < 0.0f ){ Xos_tx *= -1.0f; }

	Yos_tx -= yii_h;
	if ( Yos_tx < 0.0f ){ Yos_tx *= -1.0f; }

	Zos_tx -= zii_h;
	if ( Zos_tx < 0.0f ){ Zos_tx *= -1.0f; }

	// minimum-image convention
	if( Xos_tx > box_x - Xos_tx ){ Xos_tx -= box_x; }
	if( Yos_tx > box_y - Yos_tx ){ Yos_tx -= box_y; }
	if( Zos_tx > box_z - Zos_tx ){ Zos_tx -= box_z; }

	rij_tx = Xos_tx*Xos_tx + Yos_tx*Yos_tx + Zos_tx*Zos_tx;

	// neighbour flag and index
	if( rij_tx < rabvsq ){
		count_s_tx = 1;
		nbor_s_tx  = 1 + idx;
	}else{
		count_s_tx = 0;
		nbor_s_tx  = 0;
	}

	// Lennard-Jones energy (zero for the self-interaction and beyond the cutoff)
	if( rij_tx < rcutsq_ff && rij_tx == 0.0f ){
		ee_thread_tx = 0.0f;
	}else if( rij_tx < rcutsq_ff ){
		rij_tx = (sigOO*sigOO)/(rij_tx);
		float rij_cube = rij_tx * rij_tx * rij_tx;
		ee_thread_tx = 4.0f * epsOO * rij_cube * ( rij_cube - 1.0f );
	}else{
		ee_thread_tx = 0.0f;
	}

	Xo_d[idx]    = Xos_tx;
	count_d[idx] = count_s_tx;
	ee_d[idx]    = ee_thread_tx;
	nbor_d[idx]  = nbor_s_tx;
}

The above kernel uses

lmem = 0

smem = 80

reg  = 12

Occupancy is 100%.

However, the two kernels show essentially no difference in performance:

shared memory based: 0.098591 (ms)

register based: 0.096469 (ms)

So my suggestion is: try a larger dataset to take advantage of

(1) the roughly 100 GB/s memory bandwidth of the Tesla C1060 and

(2) the roughly 900 Gflop/s of the Tesla C1060.

Your problem size, N = 2560, is very small; I think all of your data fits into the CPU's L2 cache (4 MB or 6 MB for a quad-core),

which is why the CPU performs so well.

Thanks a ton for your detailed reply. I will try out your valuable suggestion.

Thanks again.

Hi, we do our simulations for particle numbers ranging from 2000 to 6000, so I can't increase my dataset to a very large number. Is there still a way to get the benefit of a GPU for my simulations?

Your timing measurement (code below) is not proper,

// create and start timer
unsigned int timer = 0;
cutCreateTimer(&timer);
cutStartTimer(timer);

dim3 dimGrid(10,1,1);
dim3 dimBlock(256,1,1);

ljondevice <<< dimGrid ,dimBlock >>> (Xo_d ,Yo_d ,Zo_d , ee_d ,xii_d,yii_d,zii_d,mol,count_d,nbor_d);

CUT_CHECK_ERROR("Kernel execution failed");

cudaMemcpy( ee_h,ee_d ,nBytes,cudaMemcpyDeviceToHost);
cudaMemcpy( Xo_h,Xo_d ,nBytes,cudaMemcpyDeviceToHost);
cudaMemcpy( count_h,count_d ,iBytes,cudaMemcpyDeviceToHost);
cudaMemcpy( nbor_h,nbor_d ,iBytes,cudaMemcpyDeviceToHost);

// stop and destroy timer
cutStopTimer(timer);

since you measure the kernel time and the data transfer together.

Your kernel is memory-bound (the ratio of floating-point operations to memory reads/writes is low), and the Tesla C1060 has an effective bandwidth of about 80 GB/s (the theoretical bandwidth is ~100 GB/s). However, PCI Express Gen2 delivers only 2~3 GB/s for non-pinned memory transfers, so your timing is dominated by the PCIe transfer.
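(Side note, not part of the reply above: if the host-device copies themselves matter, pinned host memory usually raises the PCIe transfer rate well above the non-pinned 2~3 GB/s. A minimal sketch of what that could look like in the host code, swapping malloc/free for cudaMallocHost/cudaFreeHost:)

// Sketch (assumption, not from the original post): allocate the host arrays as
// pinned (page-locked) memory so cudaMemcpy over PCIe runs closer to peak.
float *Xo_h;
cudaMallocHost((void**)&Xo_h, nBytes);            // instead of malloc(nBytes)
/* ... fill Xo_h and copy with cudaMemcpy exactly as before ... */
cudaFreeHost(Xo_h);                               // instead of free(Xo_h)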

I suggest that you first measure the performance of the kernel on its own:

dim3 dimGrid(10,1,1);
dim3 dimBlock(256,1,1);

int maxite = 1000;

// create and start timer
unsigned int timer = 0;
cutCreateTimer(&timer);
cutStartTimer(timer);

// Launch kernel
for(int i = 0; i < maxite; i++){
	ljondevice <<< dimGrid ,dimBlock >>> (Xo_d ,Yo_d ,Zo_d , ee_d ,xii_d,yii_d,zii_d,mol,count_d,nbor_d);
}
CUT_CHECK_ERROR("Kernel execution failed");
cudaThreadSynchronize();

// stop and destroy timer
cutStopTimer(timer);
printf("Processing time: %f (ms) \n", cutGetTimerValue(timer)/((double)maxite));
cutDeleteTimer(timer);
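(Another side note from me, not in the original reply: the same per-launch timing can be done with CUDA events instead of the cutil timer, which removes the dependency on cutil. A minimal sketch:)

// Sketch: average kernel time over maxite launches using CUDA events.
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaEventRecord(start, 0);
for (int i = 0; i < maxite; i++) {
    ljondevice<<<dimGrid, dimBlock>>>(Xo_d, Yo_d, Zo_d, ee_d, xii_d, yii_d, zii_d, mol, count_d, nbor_d);
}
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);                        // wait until all launches have finished

float elapsed_ms = 0.0f;
cudaEventElapsedTime(&elapsed_ms, start, stop);    // elapsed time in milliseconds
printf("Processing time: %f (ms)\n", elapsed_ms / (float)maxite);

cudaEventDestroy(start);
cudaEventDestroy(stop);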

Second, look at the global reads and writes in your kernel:

// read
Xos[tx] = Xo_d[idx];        // float *Xo_d
Yos[tx] = Yo_d[idx];        // float *Yo_d
Zos[tx] = Zo_d[idx];        // float *Zo_d
count_s[tx] = count_d[idx]; // int *count_d
nbor_s[tx] = nbor_d[idx];   // int *nbor_d

// write
Xo_d[idx] = Xos[tx];        // float *Xo_d
count_d[idx] = count_s[tx]; // int *count_d
ee_d[idx] = ee_thread[tx];  // float *ee_d
nbor_d[idx] = nbor_s[tx];   // int *nbor_d

In total, the kernel moves 5N floats and 4N ints (i.e. 36*N bytes), and we measure bandwidth = 36*N bytes / (kernel time).
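For clarity, here is that bandwidth calculation spelled out on the host (my own illustration; kernel_time_ms is a hypothetical variable holding the averaged kernel time measured above):

// 5N floats + 4N ints = 9N four-byte words = 36*N bytes moved per kernel launch.
double bytes_moved   = 36.0 * (double)N;
double bandwidth_GBs = (bytes_moved * 1.0e-9) / (kernel_time_ms * 1.0e-3);  // GB per second
printf("effective bandwidth = %f (GB/s)\n", bandwidth_GBs);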

case 1 (shared-memory based): N = 256*10,  grid = 10,  block = 256 : time = 0.009768 (ms), bandwidth = 8.787183 (GB/s)

case 2 (shared-memory based): N = 256*20,  grid = 20,  block = 256 : time = 0.009770 (ms), bandwidth = 17.569387 (GB/s)

case 3 (shared-memory based): N = 256*30,  grid = 30,  block = 256 : time = 0.010152 (ms), bandwidth = 25.362903 (GB/s)

case 4 (shared-memory based): N = 256*300, grid = 300, block = 256 : time = 0.056808 (ms), bandwidth = 45.326916 (GB/s)

case 4 (register based):      N = 256*300, grid = 300, block = 256 : time = 0.030667 (ms), bandwidth = 83.962978 (GB/s)

Cases 3 and 4 fully utilize all the SMs, so their performance is very good; in particular, the register-based kernel in case 4 reaches close to the maximum bandwidth (the kernel is memory-bound).

Thank you for your reply. I will try it and see how the performance goes.