Hi to all,
i am new in programming with CUDA and i have a problem. i am trying to write a program following “Nbody” sdk tutorial but something goes wrong. when i launch this kernel:
[codebox]global void sinc (float4 *a_d, unsigned long int N, float intensity, float inteNew)
{
int Idx = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;
float4 bi = a_d[Idx];
for (int q = 1; q < Q; q ++) {
// compute intensity for this body
float inte = intensity[q];
inte += computeBodyIntensity(bi, a_d, N, inte, q);
// store intensity
inteNew[q] += inte;
}
}
device float computeBodyIntensity(float4 bi, float4* a_d, unsigned long int N, float inte, int q)
{
inte = 0.0f;
for (int j = 0; j < N; j ++)
{
inte = bbinteraction(inte, a_d[j], bi, q);
__syncthreads ();
}
return inte;
}
device float bbinteraction(float inten, float4 bj, float4 bi, int q)
{
float3 r = {bi.x - bj.x, bi.y - bj.y, bi.z - bj.z};
float distSq = r.x * r.x + r.y * r.y + r.z * r.z;
if (distSq != 0) {
// N couples not calculated
float dist = sqrt(distSq);
float step = q * Qstep * dist;
inten += sin(step) / step;
} else {
inten ++;
}
return inten;
}[/codebox]
i launch the kernel in this way:
[codebox] dim3 dimGrid(N / ThNumber, 1); // number of blocks in grid
dim3 dimBlock(ThNumber, 1, 1); // number of threads in each block
sinc <<< dimGrid, dimBlock >>> ((float4*) a_d, N, intensity, inteNew);[/codebox]
i have tested a file were N = 4 and ThNumber = 2; all fine in emu mode (for q = 1, Qstep = 0.05 → inteNew[1] = 15.990002) but not in non-emu (inteNew[1] = 3.997500). :wacko:
maybe blocks are not able to sum intensity? i can not understand what can be the problem… i have missed some __syncthreads?
it would be great for me if anyone have any idea…
thank you to all,
luca