Good day,
I wrote the following program, and timing it gives this runtime:
real: 47 sec, user: 9 sec, sys: 33 sec
Is there a way to decrease the system time, either by
- switching off the graphics display (I only run calculations on the GPU), or
- improving the program?
I use Ubuntu 9.0.4 and did not succeed in running the cuda-gdb debugger;
there is a problem with glibc-2.11.
Thanks in advance for any hints.
The program looks like this:
// Working version; the inversion is done on the CPU
// real 47 sec
// 56 sequences; < 1 sec per sequence
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
// Sieve modulus: the product of the primes 2..17 (= 510510) times 64,
// i.e. 32672640. The literal "2357111317*64" was this product with its
// multiplication signs lost; the parentheses keep the macro safe inside
// larger expressions.
#define Modul (2*3*5*7*11*13*17*64)
// First prime that is not a factor of Modul; the prime list starts here.
#define Start_Prime 19
// Number of threads used per kernel launch; also the stride of each thread.
#define Cores 192
// Element counts of the sequence array and of the host sieve field.
#define Sequ_max Modul
#define Feld_max Modul
// Capacity of the prime list, including the zero terminators.
#define Liste_max 2013000
// Fixed-size array types; the program passes them around by pointer.
typedef int sequ [Sequ_max];
typedef int feld [Feld_max];
typedef int liste [Liste_max];
typedef int liste_inv [Liste_max];
typedef int results [Cores];
typedef int begin [1];
// Computes the modular inverse of a modulo n with the extended Euclidean
// algorithm.
//
// Parameters:
//   a - value to invert; the loop only runs while a > 0
//   n - modulus
// Returns:
//   v in [0, n) with a * v congruent to gcd(a, n) modulo n, which is the
//   inverse of a when a and n are coprime. For a <= 0 the loop is skipped
//   and 0 is returned.
long long modInverse(long long a, long long n)
{
    long long i = n, v = 0, d = 1;
    while (a > 0)
    {
        // Quotient and saved operand are kept in 64 bits: holding them in
        // int truncated them silently once a or n left the int range.
        long long t = i / a, x = a;
        a = i - t * a;
        i = x;
        x = d;
        d = v - t * x;
        v = x;
    }
    // Bring the coefficient into the range [0, n).
    v %= n;
    if (v < 0) v += n;
    return v;
}
// Device helper: atomically adds 1 to the int at s.
// Despite the name, the cell is a counter rather than a single bit; the
// Count kernel in this file only tests whether the cell is zero.
__device__ void Setbit(int* s)
{
    atomicAdd(s, 1);
}
// Device helper: atomically replaces the int at s with 0 and returns the
// value that was stored there before.
__device__ int Exch(int* s)
{
    return atomicExch(s, 0);
}
// Device helper: returns the current value of the int at s.
// The read is done as an atomic add of 0, which leaves the value unchanged.
__device__ int Fetch(int* s)
{
    return atomicAdd(s, 0);
}
// Kernel: for every prime p in L, increments each cell A[k] with
// k = (Inv[i] * b) % p, k + p, k + 2p, ... below Modul, where b = B[0].
//
// Launch layout: written for one block of Cores threads (see the launch in
// main). Thread t processes list entries t, t + Cores, t + 2*Cores, ... and
// stops at the first entry that is <= 0, so the list is expected to carry
// one zero terminator per thread.
//
// Parameters:
//   A   - device array of Modul ints; hit cells are incremented via Setbit
//   L   - device list of primes, terminated as described above
//   Inv - device list holding the per-prime factor that belongs to L[i]
//   B   - device array holding the single start value b
__global__ void Sieve(sequ* A, liste* L, liste_inv* Inv, begin* B)
{
    // B, L and Inv are only read by this kernel, so plain loads are used.
    const long long b = (*B)[0];

    // The index is bounded by Liste_max so that a missing terminator cannot
    // make a thread read past the end of the lists.
    for (int i = threadIdx.x; i < Liste_max; i += Cores)
    {
        const int p = (*L)[i];
        if (p <= 0) break;  // terminator reached: this thread is done

        const long long p_inv = (*Inv)[i];
        // First index hit by p for this start value; the product is formed
        // in 64 bits.
        long long start = (p_inv * b) % p;
        while (start < Modul)
        {
            Setbit(&(*A)[start]);
            start += p;
        }
    }
}
// Kernel: counts the cells of A that are zero and resets every visited cell
// to zero in the same pass.
//
// Launch layout: written for one block of Cores threads (see the launch in
// main). Thread t visits cells t, t + Cores, t + 2*Cores, ... below Modul
// and writes its own count to R[t]; the caller adds up the Cores partial
// counts.
//
// Parameters:
//   A - device array of Modul ints; cleared to 0 by this kernel
//   R - device array of Cores ints receiving the per-thread counts
__global__ void Count(sequ* A, results* R)
{
    const int j = threadIdx.x;
    // R has exactly Cores slots; surplus threads from a wider launch must
    // neither write outside it nor clear cells owned by another thread.
    if (j >= Cores) return;

    int z = 0;
    for (int i = j; i < Modul; i += Cores)
    {
        // Exch returns the old value and leaves 0 behind, so A is ready for
        // the next sieve pass.
        const int r = Exch(&(*A)[i]);
        if (r == 0) z++;
    }
    // Store this thread's count in the result array.
    (*R)[j] = z;
}
// Returns the greatest common divisor of a and b using Euclid's algorithm.
// gcd(0, b) returns b and gcd(a, 0) returns a.
long long gcd(long long a, long long b)
{
    while (a != 0)
    {
        // The temporary must be 64-bit; an int temporary truncated operands
        // above the int range and produced wrong results.
        long long c = a;
        a = b % a;
        b = c;
    }
    return b;
}
// Builds the prime list and the matching factor list on the host.
//
// Parameters:
//   a   - scratch field of Feld_max ints used as an odd-only sieve of
//         Eratosthenes; only odd indices >= 3 are initialised and used
//   l   - receives the primes p with Start_Prime <= p < Feld_max, followed
//         by Cores zero terminators (one per kernel thread)
//   inv - receives, for each prime p in l, (p * modInverse(p, Modul) - 1)
//         / Modul; the terminator slots are set to 0
//
// Terminates the program with exit code 1 if the primes plus terminators do
// not fit into Liste_max entries.
void primes (feld *a, liste *l, liste_inv *inv)
{
    long long start, j;
    long long beg;

    // Mark every odd number >= 3 as a prime candidate.
    for (long long i = 3; i < Feld_max; i += 2)
    {
        (*a)[i] = 1;
    }

    // Strike out odd multiples of each odd prime. Only odd i are visited:
    // even cells are never initialised, so they must not be read. The bound
    // i*i < Feld_max also covers i equal to the integer square root.
    for (long long i = 3; i * i < Feld_max; i += 2)
    {
        if ((*a)[i] == 1)
        {
            start = i * i;
            while (start < Feld_max)
            {
                (*a)[start] = 0;
                start += 2 * i;
            }
        }
    }

    // Build the list of primes.
    j = 0;
    for (long long i = Start_Prime; i < Feld_max; i += 2)
    {
        if ((*a)[i] == 1)
        {
            // Keep room for this prime and the Cores terminators after it.
            if (j + Cores >= Liste_max)
            {
                fprintf (stderr, "primes: Liste_max (%d) is too small for the prime list\n", (int) Liste_max);
                exit (1);
            }
            (*l)[j] = (int) i;
            // i * modInverse(i, Modul) is 1 modulo Modul, so the division
            // below is exact.
            beg = i * modInverse(i, Modul);
            beg -= 1;
            beg /= Modul;
            (*inv)[j] = (int) beg;
            j++;
        }
    }

    // Terminate both lists with 0 for every thread.
    for (long long i = 0; i < Cores; i++)
    {
        (*l)[j] = 0;
        (*inv)[j] = 0;
        j++;
    }

    // NOTE(review): the label says Liste_max but the value printed is Modul.
    printf ("Liste_max = %i\n", (int) (Modul));
    printf ("Liste mit Terminierung = %lli\n", j-1);
}
// Reports a failed CUDA runtime call on stderr and exits with code 1.
static void checkCuda(cudaError_t status, const char* what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
        exit(1);
    }
}

// Program entry point.
// Precomputes the prime lists on the host, uploads them, and then, for every
// odd start value l < 300 that is coprime to Modul, runs the Sieve and Count
// kernels and prints the sum of the per-thread counts.
// Returns 0 on success; exits with code 1 on any allocation or CUDA error.
int main()
{
    long long ges_resultat, r;

    // Ask the runtime to block the host thread while waiting for the GPU
    // instead of busy-waiting; this is intended to cut the CPU time spent
    // inside synchronising calls. It has to happen before the first call
    // that creates the CUDA context. A failure here is not fatal.
    if (cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync) != cudaSuccess)
    {
        fprintf(stderr, "warning: could not enable blocking synchronisation\n");
        cudaGetLastError();  // clear the error so later checks start clean
    }

    // Host buffers.
    feld *h_Feld = (feld*) malloc(sizeof(feld));
    liste *h_Liste = (liste*) malloc(sizeof(liste));
    liste_inv *h_Liste_inv = (liste_inv*) malloc(sizeof(liste_inv));
    results *h_R = (results*) malloc(sizeof(results));
    begin *h_B = (begin*) malloc(sizeof(begin));
    if (h_Feld == NULL || h_Liste == NULL || h_Liste_inv == NULL ||
        h_R == NULL || h_B == NULL)
    {
        fprintf(stderr, "host memory allocation failed\n");
        exit(1);
    }

    // Device buffers.
    sequ* d_A;
    checkCuda(cudaMalloc(&d_A, sizeof (sequ)), "cudaMalloc d_A");
    liste* d_Liste;
    checkCuda(cudaMalloc(&d_Liste, sizeof (liste)), "cudaMalloc d_Liste");
    liste_inv* d_Liste_inv;
    checkCuda(cudaMalloc(&d_Liste_inv, sizeof (liste_inv)), "cudaMalloc d_Liste_inv");
    results* d_R;
    checkCuda(cudaMalloc(&d_R, sizeof (results)), "cudaMalloc d_R");
    begin* d_B;
    checkCuda(cudaMalloc(&d_B, sizeof (begin)), "cudaMalloc d_B");

    // Zero the sequence array directly on the device; no zero-filled host
    // copy of the same size has to be allocated and transferred.
    checkCuda(cudaMemset(d_A, 0, sizeof (sequ)), "cudaMemset d_A");

    // Compute the prime list.
    primes (h_Feld, h_Liste, h_Liste_inv);
    printf ("Primes precalculated\n");
    // The sieve field is only needed while building the lists.
    free (h_Feld);

    // Copy the lists from host memory to device memory.
    checkCuda(cudaMemcpy(d_Liste, h_Liste, sizeof (liste), cudaMemcpyHostToDevice),
              "copy prime list");
    checkCuda(cudaMemcpy(d_Liste_inv, h_Liste_inv, sizeof (liste_inv), cudaMemcpyHostToDevice),
              "copy factor list");

    for (long long l = 1; l < 300; l += 2)
    {
        // Only start values coprime to Modul are processed.
        if (gcd (l, Modul) != 1) continue;

        printf ("Startsequence = %lli\n", l);
        (*h_B)[0] = (int) l;
        checkCuda(cudaMemcpy(d_B, h_B, sizeof (begin), cudaMemcpyHostToDevice),
                  "copy start value");

        // Parallel computation on the CUDA card. Launch errors are not
        // returned by the launch itself, so they are fetched right after it.
        printf ("Sieving\n");
        Sieve<<<1, Cores>>>(d_A, d_Liste, d_Liste_inv, d_B);
        checkCuda(cudaGetLastError(), "Sieve launch");
        printf ("Count\n");
        Count<<<1, Cores>>>(d_A, d_R);
        checkCuda(cudaGetLastError(), "Count launch");

        // Copy the results to the host; this blocking copy also surfaces
        // errors raised while the kernels were running.
        checkCuda(cudaMemcpy(h_R, d_R, sizeof (results), cudaMemcpyDeviceToHost),
                  "copy results");

        // Add up the per-thread results on the host.
        ges_resultat = 0;
        for (int i = 0; i < Cores; i++)
        {
            r = (*h_R)[i];
            ges_resultat += r;
        }
        printf ("Ges %lli\n", ges_resultat);
    }

    // Free device memory.
    cudaFree(d_A);
    cudaFree(d_Liste);
    cudaFree(d_Liste_inv);
    cudaFree(d_R);
    cudaFree(d_B);
    // Free host memory.
    free (h_Liste);
    free (h_Liste_inv);
    free (h_R);
    free (h_B);
    return 0;
}
Kind regards from the world of primes,
hero of primes