Hello,
If i choose 256 Particles and the Emu Mode, the results are right, but
not in Emu Mode the results are wrong.
Furthermore if i choose more than 256 particles, e.g. 512, the force_out[index]
variables in the “computeForces” Function get the value of -1.#IND00, in the second call of the
function.
Can somebody help me ?
I think i make something wrong with the block size and the thred size, but i can’t see the failure.
template.cu
#include "template_kernel.cu"
#include <cutil.h>
#include <stdio.h>
void computeParticle(float* pos,float* force, int numParticle, int p, int q)
{
int sharedMemSize = p * q * sizeof(float3);
dim3 threads(p,q,1);
dim3 grid(numParticle/p, 1, 1);
computeForces<<< grid, threads, sharedMemSize >>>((float3*)pos, (float3*)force, numParticle);
updatePosition<<< grid, threads, sharedMemSize >>>((float3*)pos, (float3*)force, numParticle);
CUT_CHECK_ERROR("Kernel execution failed");
}
void randomize(float* data,int numParticle)
{...}
int main(int argc, char** argv)
{
CUT_DEVICE_INIT(argc, argv);
int numParticle = 512; // !!!!!!!!!!!!!!!!!!!!!!!
float* h_pos = new float[numParticle*3];
float* d_pos;
float* d_force;
unsigned int size = sizeof( float) * 3 * numParticle;
memset(h_pos, 0, size);
cudaMalloc((void**)&d_pos, size);
cudaMalloc((void**)&d_force, size);
randomize(h_pos, numParticle);
cudaMemcpy(d_pos, h_pos, size, cudaMemcpyHostToDevice);
//---------------------------------------
for(int i=0;i < 2; ++i) // iteration loop
{
printf("\r%i",i);
computeParticle(d_pos, d_force, numParticle, 64, 4);
}
//---------------------------------------
cudaMemcpy(h_pos, d_pos, size, cudaMemcpyDeviceToHost);
//write to File
FILE* fp;
fp = fopen("c:\data.txt","w");
for(int i=0; i<numParticle; ++i)
{
fprintf(fp, "%f\n%f\n%f\n", h_pos[i*3], h_pos[i*3+1], h_pos[i*3+2]);
}
fclose(fp);
cudaFree((void**)d_pos);
cudaFree((void**)d_force);
delete [] h_pos;
system("pause");
}
template_kernel.cu
#ifndef _TEMPLATE_KERNEL_H_
#define _TEMPLATE_KERNEL_H_
#include <stdio.h>
__global__ void computeForces(float3* pos_in, float3* force_out, int numParticle)
{
int index = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
force_out[index].x = 0;
force_out[index].y = 0;
force_out[index].z = 0;
for(int i = 0; i < numParticle; ++i)
{
if(i == index)
continue;
float3 vec;
vec.x = pos_in[index].x - pos_in[i].x;
vec.y = pos_in[index].y - pos_in[i].y;
vec.z = pos_in[index].z - pos_in[i].z;
float d = sqrtf(vec.x * vec.x + vec.y * vec.y + vec.z * vec.z);
if(d == 0)
d == 0.001;
vec.x *= 1/(d*d*10);
vec.y *= 1/(d*d*10);
vec.z *= 1/(d*d*10);
/*if(i%64 == 0)
{
printf("F %f\t V %f\t P %f\n", force_out[index].x, vec.x, pos_in[index].x);
}*/
force_out[index].x += vec.x;
force_out[index].y += vec.y;
force_out[index].z += vec.z;
}
}
__global__ void updatePosition(float3* pos, float3* force_in, int numParticle)
{
int index = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
float3 force = force_in[index];
force.x *= 1.0f/10.0f;
force.y *= 1.0f/10.0f;
force.z *= 1.0f/10.0f;
pos[index].x += force.x;
pos[index].y += force.y;
pos[index].z += force.z;
// project to Sphere radius 1
float d = sqrtf(pos[index].x * pos[index].x + pos[index].y * pos[index].y + pos[index].z * pos[index].z);
// radius = 1
pos[index].x *= 1/d;
pos[index].y *= 1/d;
pos[index].z *= 1/d;
}
#endif // #ifndef _TEMPLATE_KERNEL_H_