In my program I need to calculate gravity for every created particle
with every other particle.
However I am not able to make more than 256 particles or the program hangs.
I think it has something to do with blocks in the kernel because I have to
read the data from the array for every particle in every thread.
void
updateUniverse(uint vboOldPos, uint vboNewPos, uint oldVel, uint newVel, int items)
{
float4 *oldPos, *newPos;
CUDA_SAFE_CALL(cudaGLMapBufferObject((void**)&oldPos, vboOldPos));
CUDA_SAFE_CALL(cudaGLMapBufferObject((void**)&newPos, vboNewPos));
float3 *vDOld;
float3 *vDNew;
CUDA_SAFE_CALL(cudaGLMapBufferObject((void**)&vDOld, oldVel));
CUDA_SAFE_CALL(cudaGLMapBufferObject((void**)&vDNew, newVel));
int numThreads = min(512, items);
int numBlocks = (int) ceil(items / (float) numThreads);
updateUniverseD<<< numBlocks, numThreads, sizeof(float) * items * 3 >>>(oldPos, newPos, vDOld, vDNew, items);
// check if kernel invocation generated an error
CUT_CHECK_ERROR("Kernel execution failed");
CUDA_SAFE_CALL(cudaGLUnmapBufferObject((uint) vboOldPos));
CUDA_SAFE_CALL(cudaGLUnmapBufferObject((uint) vboNewPos));
CUDA_SAFE_CALL(cudaGLUnmapBufferObject((uint) oldVel));
CUDA_SAFE_CALL(cudaGLUnmapBufferObject((uint) newVel));
}
Part from kernel.cu:
__device__
void recalcMagD(float4 * pos, float4 * posnew, float3 *posVbo, int i, int items)
{
int ii=0;
double v1x,v2x,v1z,v2z, vx ,vz, vmass, vangle, vmag, ox ,oz, omass;
double bot_distance;
double grav, gangle;
vx = pos[i].x;
vz = pos[i].z;
vmass = posVbo[i].x;
vmag = posVbo[i].y;
vangle = posVbo[i].z;
for (ii=0; ii < items; ii++)
{
if(i != ii)
{
ox = pos[ii].x;
oz = pos[ii].z;
omass = posVbo[ii].x;
bot_distance = (sqrtf(powf(ox - vx, 2)+
powf(oz - vz, 2)));
grav = (((GRAV * vmass * omass)/powf(bot_distance, 2))/vmass);
gangle = atan2f(oz - vz , ox - vx);
v1x = vx+(vmag * cosf(vangle));
v1z = vz+(vmag * sinf(vangle));
v2x = vx+(grav * cosf(gangle));
v2z = vz+(grav * sinf(gangle));
vx = vx - ((v1x + v2x))/2.0;
vz = vz - ((v1z + v2z))/2.0;
vmag = sqrtf(powf(vx , 2) + powf(vz, 2)) * 2.0;
vangle = atan2f(vz, vx);
}
}
posVbo[i] = make_float3(vmass, vmag, vangle);
posnew[i] = make_float4((pos[i].x + (vmag * cosf(vangle))), 0.0, (pos[i].z + (vmag * sinf(vangle))), 0.0);
}
__global__ void
updateUniverseD(float4 * posVboOld, float4* posVboNew, float3 *vo, float3 *vn, int items)
{
int ii = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
extern __shared__ float3 shared[];
shared[ii] = vo[ii];
__syncthreads();
recalcMagD(posVboOld, posVboNew, shared, ii, items)
vn[ii] = shared[ii];
__syncthreads();
}
I’m pretty confused, and any help is highly appreciated