I wonder what exactly causes the performance difference between two version of the same kernel. First kernel uses temporary variables (float4 tPos, tVel) which are stored in registers (~408 fps). Second kernel reads a data directly from the global memory (~426 fps). I think that there is too little operations which using registers (tPos, tVel) to achieve some benefits of the low latency. Generally the kernel seems to be suitable to coalesced memory access so maybe it causes better performance in the second case (but I’m not sure how using registers affect to coalesced memory access in the first case).
The number of particles = 1048576, threads per block = 128, blocks per grid = 8192.
First kernel:
__global__ void particles_kernel(float4 *vbo, float4* pos, float4* vel, int np)
{
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if(tid < np)
{
float4 tPos = pos[tid];
float4 tVel = vel[tid];
tPos.x += tVel.x;
tPos.y += tVel.y;
tPos.z += tVel.z;
if(tPos.x < -3.0f || tPos.x > 3.0f )
{
tVel.x = -tVel.x;
}
if(tPos.y < -3.0f || tPos.y > 3.0f)
{
tVel.y = -tVel.y;
}
if(tPos.z < -3.0f || tPos.z > 3.0f)
{
tVel.z = -tVel.z;
}
pos[tid] = tPos;
vel[tid] = tVel;
vbo[tid] = make_float4(tPos.x, tPos.y, tPos.z, 1.0f);
}
}
Second kernel:
__global__ void particles_kernel(float4 *vbo, float4 *pos, float4 *vel, int np)
{
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if(tid < np)
{
pos[tid].x += vel[tid].x;
pos[tid].y += vel[tid].y;
pos[tid].z += vel[tid].z;
if(pos[tid].x < -3.0f || pos[tid].x > 3.0f)
{
vel[tid].x = -vel[tid].x;
}
if(pos[tid].y < -3.0f || pos[tid].y > 3.0f)
{
vel[tid].y = -vel[tid].y;
}
if(pos[tid].z < -3.0f || pos[tid].z > 3.0f)
{
vel[tid].z = -vel[tid].z;
}
vbo[tid] = make_float4(pos[tid].x, pos[tid].y, pos[tid].z, 1.0f);
}
}