Runtime: real, user, system

A beautiful day,

I wrote the following program, and I get the following runtimes:

real:47 sec , user: 9 sec, sys: 33 sec

Is there a chance to decrease the system time, by

  1. switching off the graphics (I only run a computation on the GPU)

  2. improving the program

I use Ubuntu 9.04 and did not succeed in running the cuda-gdb debugger.

There is a problem with glibc-2.11

Thanks in advance for every hint.

The program looks like this:

// Version o.k. Invertierung auf Prozessor

// real 47 sec

// 56 Sequenzen; <1 sec / Seq

#include <stdio.h>

#include <math.h>

// NOTE(review): 2357111317 does not fit in a 32-bit int, and the product
// 2357111317*64 overflows int arithmetic (undefined behavior). Confirm the
// intended modulus; a parenthesized long long literal such as
// (2357111317LL*64) is probably what was meant.
#define Modul 2357111317*64

// Smallest prime placed into the sieving list (primes below this are skipped).
#define Start_Prime 19

// Number of GPU threads launched per kernel (one block of this many threads).
#define Cores 192

#define Sequ_max Modul

#define Feld_max Modul

// Capacity of the prime list (and its inverse list), including terminators.
#define Liste_max 2013000

// Array typedefs used for the host/device buffers.
// NOTE(review): with Sequ_max/Feld_max defined as Modul above, these array
// types would be enormous (hundreds of GB) if the overflow were fixed —
// verify the intended sizes against the author's original file.
typedef int sequ [Sequ_max];

typedef int feld [Feld_max];

typedef int liste [Liste_max];

typedef int liste_inv [Liste_max];

// One counter slot per GPU thread ("core").
typedef int results [Cores];

// Single-element array carrying the start value of the current sequence.
typedef int begin [1];

// Compute the modular inverse of a modulo n via the extended Euclidean
// algorithm.
//
// Parameters:
//   a - value to invert; must satisfy gcd(a, n) == 1 for an inverse to exist
//   n - modulus, n > 0
// Returns: v in [0, n) such that (a * v) % n == 1.
//
// NOTE: braces lost in the forum paste have been restored, and the loop
// temporaries are now long long — the original declared them int, which
// silently truncates for moduli above 2^31-1.
long long modInverse(long long a, long long n)
{
    long long i = n, v = 0, d = 1;

    while (a > 0)
    {
        long long t = i / a, x = a;
        a = i - t * a;
        i = x;
        x = d;
        d = v - t * x;
        v = x;
    }

    // Normalize the Bezout coefficient into the range [0, n).
    v %= n;
    if (v < 0) v += n;
    return v;
}

// Atomically increment *s by one; used to mark a cell of the sieve array A
// from concurrently running device threads.
// NOTE: restored the "__device__" qualifier — the double underscores were
// stripped by the forum formatting ("device" is not a CUDA keyword).
__device__ void Setbit (int* s)
{
    atomicAdd (s, 1);
}

// Atomically read *s and reset it to 0, returning the previous value.
// Used by Count to both read and clear a sieve cell in one step.
// NOTE: restored the "__device__" qualifier lost in the forum paste.
__device__ int Exch (int* s)
{
    return (atomicExch (s, 0));
}

// Atomically read *s (implemented as an atomic add of 0, which returns the
// old value without modifying it).
// NOTE: restored the "__device__" qualifier lost in the forum paste.
__device__ int Fetch (int* s)
{
    return (atomicAdd (s, 0));
}

// Kernel: each thread takes one prime p from L and marks its multiples in the
// shared array A with atomic increments, starting from a position derived
// from the sequence start value B[0].
// NOTE(review): the qualifier should read "__global__" — the underscores were
// stripped by forum formatting. All braces and several statements of this
// kernel were also lost in the paste; the notes below mark the gaps. Confirm
// against the author's original file.
global void Sieve(sequ* A, liste* L, liste_inv* Inv, begin* B)


// One thread per list entry.
int i      = threadIdx.x;

// Start value of the current sequence, published by the host in B[0].
long long b= Fetch (&(*B)[0]);

// p: this thread's prime (0 terminates the list); p_inv: presumably its
// modular inverse mod Modul — TODO confirm, the computation using it is lost.
int p      = Fetch (&(*L)[i]);

int p_inv  = Fetch (&(*Inv)[i]);

// NOTE(review): the statement initializing `start` (presumably from b, p and
// p_inv) was lost — as pasted, `start` is read uninitialized below.
long long start;

while (p>0)


// NOTE(review): statements are missing here (likely computing `start` and
// advancing `i` to this thread's next list slot); as shown the loop never
// makes progress.

while (start<Modul)


        Setbit (&(*A)[start]);

// NOTE(review): the inner-loop increment (presumably `start += p`) was lost
// in the paste.


// Re-read the next prime/inverse pair for this thread.
    p     = (*L)[i];

    p_inv = (*Inv)[i];


// Kernel: each thread walks its strided share of A, atomically reads and
// clears every cell (resetting A for the next round), and counts how many
// cells were never marked by Sieve.
// NOTE(review): the qualifier should read "__global__" (underscores lost in
// the paste). The braces, the stride increment (presumably `i += Cores`) and
// the statement storing z into (*R)[j] were also lost — see notes below.
global void Count(sequ* A, results* R)


// z counts cells of A that were still zero (unmarked) in this thread's share.
int z=0;

int i = threadIdx.x;

// j remembers this thread's slot in the results array.
int j=i;

int r;

while (i<Modul)


    // Atomic read-and-clear so A is zeroed for the next sequence.
    r = Exch (&(*A)[i]);

    if (r==0) z++;

// NOTE(review): the loop increment (presumably `i += Cores`) was lost in the
// paste; as shown the loop never advances.


// Write this thread's count into the results array.
// NOTE(review): the store itself (presumably `(*R)[j] = z;`) was lost.


// Euclidean algorithm: greatest common divisor of a and b.
//
// Parameters: a, b - nonnegative integers (gcd(0, b) == b).
// Returns: gcd(a, b).
//
// NOTE: braces lost in the forum paste have been restored, and the swap
// temporary is now long long — the original declared it int, which truncates
// values above 2^31-1 and yields wrong results for large inputs.
long long gcd (long long a, long long b)
{
    long long c;

    while (a != 0)
    {
        c = a;
        a = b % a;
        b = c;
    }

    return b;
}

// Host-side sieve of Eratosthenes: fills *a with prime flags, then builds the
// list of primes (from Start_Prime upward) in *l and their modular inverses
// (mod Modul) in *inv, terminated with one 0 entry per core.
// NOTE(review): the braces and most loop bodies of this function were lost in
// the forum paste; the notes below mark the gaps.
void primes (feld *a, liste *l, liste_inv *inv)


long long start, j;

long long beg;

// Initialize odd entries of the flag array.
// NOTE(review): the loop body (presumably `(*a)[i] = 1;`) was lost.
for (long long i=3; i<Feld_max; i+=2)




// Classic sieve: for each surviving i, clear its multiples.
for (long long i=3, sqrt_Feld_max=sqrt (Feld_max); i<sqrt_Feld_max; i++)


  if ((*a)[i]==1)


// NOTE(review): the initialization of `start` (presumably i*i or 2*i) was
// lost; it is read uninitialized below.

     while (start<Feld_max)

// NOTE(review): the loop body clearing (*a)[start] and the increment
// (presumably `start += i` or `start += 2*i`) were lost.





// Build the list of primes
// NOTE(review): the initialization of the output index `j` was lost; `j` is
// used uninitialized below and printed at the end.

for (long long i=Start_Prime; i<Feld_max; i+=2)

// NOTE(review): the body storing each prime into (*l)[j] was lost; only the
// inverse computation below survived the paste.



beg=i*modInverse(i, Modul);

// NOTE(review): statements storing into (*inv)[j] and advancing j were lost.





// Terminate the list with a 0 entry for each core
for (long long i=0; i<Cores; i++)

// NOTE(review): the body writing the 0 terminators (and advancing j) was
// lost.



// NOTE(review): these two printf calls use typographic quotes (“ ”) from the
// forum formatting and will not compile as-is; also %i does not match the
// (overflowed) Modul expression — verify the original source.
printf (“Liste_max = %i\n”, Modul);

printf (“Liste mit Terminierung = %lli\n”, j-1);



// Driver: allocate host and device buffers, precompute the prime list on the
// CPU, then for each admissible start sequence l run the Sieve and Count
// kernels on the GPU and sum the per-thread counts on the host.
// NOTE(review): braces, several loop bodies, and some '*' characters were
// lost when this was pasted into the forum; typographic quotes (“ ”) in some
// printf strings will not compile. Notes below mark each problem.
int main()


   long long ges_resultat, r;

// NOTE(review): the asterisks were eaten by forum formatting — this should
// presumably read `sequ* h_A = (sequ*) malloc(sizeof(sequ));` (casting to an
// array type is invalid C).
sequ h_A = (sequ) malloc(sizeof(sequ));

   feld      *h_Feld      = (feld*)      malloc(sizeof(feld));

   liste     *h_Liste     = (liste*)     malloc(sizeof(liste));

   liste_inv *h_Liste_inv = (liste_inv*) malloc(sizeof(liste_inv));

   results   *h_R         = (results*)   malloc(sizeof(results));

   begin     *h_B         = (begin*)     malloc(sizeof(begin));

// Device-side buffers; exit on any allocation failure.
sequ* d_A;

   if (cudaMalloc(&d_A, sizeof (sequ))!=cudaSuccess) exit(1);

liste* d_Liste;

   if (cudaMalloc(&d_Liste, sizeof (liste))!=cudaSuccess) exit (1);

liste_inv* d_Liste_inv;

   if (cudaMalloc(&d_Liste_inv, sizeof (liste_inv))!=cudaSuccess) exit (1);

results* d_R;

   if (cudaMalloc(&d_R, sizeof (results))!=cudaSuccess) exit (1);

begin* d_B;

   if (cudaMalloc(&d_B, sizeof (begin))!=cudaSuccess) exit (1);

// Initialize

   for (long long i=0; i<Feld_max; i++)

// NOTE(review): the loop body (presumably zeroing h_A / h_Feld) was lost in
// the paste.



// Compute the prime list

   primes (h_Feld, h_Liste, h_Liste_inv);

   printf ("Primes precalculated\n");

// Copy vectors from host memory to device memory

if (cudaMemcpy(d_Liste, h_Liste, sizeof (liste), cudaMemcpyHostToDevice)!=cudaSuccess) exit(1);

   if (cudaMemcpy(d_Liste_inv, h_Liste_inv, sizeof (liste),     cudaMemcpyHostToDevice)!=cudaSuccess) exit(1);       

   if (cudaMemcpy(d_A,         h_A,         sizeof (sequ),      cudaMemcpyHostToDevice)!=cudaSuccess) exit(1);

// Try each odd start value l coprime to Modul.
for (long long l=1; l<300; l+=2)


      if (gcd (l, Modul)==1)


// NOTE(review): typographic quotes — will not compile as-is.
printf (“Startsequence = %lli\n”, l);

(*h_B)[0] =l;

if (cudaMemcpy(d_B, h_B, sizeof (begin), cudaMemcpyHostToDevice)!=cudaSuccess) exit(1);

// Parallel computation on the CUDA card
// NOTE(review): no cudaGetLastError()/cudaDeviceSynchronize() after the
// launches — launch and execution errors go undetected here.

        printf ("Sieving\n");

        Sieve<<<1, Cores>>>(d_A, d_Liste, d_Liste_inv, d_B);

// NOTE(review): typographic quotes — will not compile as-is.
printf (“Count\n”);

        Count<<<1, Cores>>>(d_A, d_R);

// Copy the results back to the host (blocking memcpy also synchronizes)

        cudaMemcpy(h_R, d_R, sizeof (results), cudaMemcpyDeviceToHost);

// Sum the per-thread results on the host
// NOTE(review): the loop body (presumably accumulating (*h_R)[i] into
// ges_resultat) and the initialization of ges_resultat were lost.

        for (int i=0; i<Cores; i++)




        printf ("Ges %lli\n", ges_resultat);



// Free device memory
// NOTE(review): all frees below are commented out — device and host memory
// leak until process exit; d_Liste_inv, d_R and d_B are never freed at all.

// cudaFree(d_A);

// cudaFree(d_Liste);

// Free host memory

// free (h_A);

// free (h_Feld);

// free (h_Liste);

Nice Greetings from the primes

hero of primes

System time stands for the time spent by your program in the NVIDIA driver part.
I hope your program spends more time in Kernel Execution than cudaMalloc, cudaMemcpys and the likes.
If that is indeed the case, it is possible that cudaRT library is busy polling the NVIDIA driver for Kernel Execution completion.
Just call cudaSetDeviceFlags with the cudaDeviceScheduleBlockingSync flag. This makes the NVIDIA driver put your application to sleep while it waits for kernel execution to complete.
This can bring down system-time substantially (hopefully).
Make sure your system is not running other IO intensive programs (like anti-virus, disk scans, network IO etc.)