i’m new in CUDA, i wrote a program to determine the number of PI
__global__ void Pi(float* out, long N, int nthreads, int nblocks)
{
float i;
float x;
float licznik = 4.0;
int idx = blockIdx.x*blockDim.x+threadIdx.x;
if (idx%2!=0)
licznik = -4.0;
for (i = idx; i <= N; i += nthreads * nblocks)
{
out[idx] += licznik/(2 * i + 1);
}
}
...
You are exceeding the useable precision of 32 bit floating point arithmetic, and the way you are doing summation in the kernel is not helping either, especially if you are using a lot of threads.
for 100 1000 10000 decimal if you want all in integer
i dont know if it s easy to do in cuda
Sub a()
Dim ta1(50) As Long
For i = 1 To 50
ta1(i) = 0
Next i
ta1(1) = 1
For i = 500 To 1 Step -1
For j = 50 To 1 Step -1
ta1(j) = ta1(j) * i + reste
reste = 0
If ta1(j) > 10000 Then
reste = Int(ta1(j) / 10000)
ta1(j) = ta1(j) - reste * 10000
End If
Next j
reste = 0
For j = 1 To 50
reste2 = (ta1(j) + reste * 10000) Mod (2 * i + 1)
ta1(j) = Int((reste * 10000 + ta1(j)) / (2 * i + 1))
reste = reste2
Next j
ta1(1) = ta1(1) + 1
reste = 0
Next i
End Sub