Is this prime number counting algorithm a good candidate for CUDA?

I’ve done a fair bit of numerical (particularly number theory) programming in C/C++, but I haven’t dipped my toes into harnessing my graphics card for my projects yet.

Can anyone tell me how difficult it would be to get the following approach running in CUDA, or if it would even be worth trying?

I’ve included a single-threaded C implementation of the algorithm at the bottom of this post - it’s about 130 lines long. Empirically, it runs in something like O(n^(4/5)) time and O(log_2 n) space (for the stack size) - basically, it uses a fixed amount of memory for a write-once cache at program start-up for a wheel that is read-only during the algorithm’s execution, and then the main program execution is more or less purely functional (and should be relatively easy to parallelize because of that, is my hunch). On my PC, this computes the primes <= 10^13 in around 11 seconds, which is not shabby for an algorithm that doesn’t allocate any non-stack memory. 64-bit integers are essential, but probably not higher.

For the sake of explanation, the basic identity, expressed in Mathematica, is

d[n_, 0, a_] := 1
d[n_, 1, a_] := Floor[n] - a
d[n_, k_, a_] := d[n,k,a]= 
 Sum[Binomial[k, j] d[Floor[n/(m^(k - j))], j, m], {m, a + 1, n^(1/k)}, {j, 0, k - 1}]

RiemannPrimeCounting[n_] := Sum[(-1)^(k + 1)/k  d[n, k, 1], {k, 1, Log[2, n]}]
CountPrimes[ n_] := Sum[MoebiusMu[k] RiemannPrimeCounting[n^(1/k)]/k, {k, 1, Log[2, n]}]

That’s the essence of the math.

Then, crucially, to really speed things up, we apply a wheel. The core idea of the wheel is that, for the function named “d” here, we can iterate over only numbers not divisible by the first x primes, which is much faster. The general idea looks something like this.

(* For the following block of code, all you really need to know are what 
  the following 2 functions do.
  -"Use[n]" is 1 if n is not divisible by the first WheelEntries primes, 0 otherwise
  -"Coprimes[n]" is the sum of Use[1] through Use[n] *)
WheelEntries = 5;
WheelSize := Product[Prime[j], {j, 1, WheelEntries}];
CoprimeCache := CoprimeCache=Table[CoprimeQ[WheelSize, n], {n, 1, WheelSize}]
Use[n_] := Use[n]=If[CoprimeCache[[Mod[n - 1, WheelSize] + 1]] == True, 1, 0]
LegendrePhi[x_, a_] := LegendrePhi[x,a]=LegendrePhi[x, a - 1] - LegendrePhi[x/Prime[a], a - 1]
LegendrePhi[x_, 0] := Floor[x]
LegPhiCache := LegPhiCache = Table[LegendrePhi[n, WheelEntries], {n, 1, WheelSize}]
FullWheel := LegendrePhi[WheelSize, WheelEntries];
Coprimes[n_] := LegPhiCache[[Mod[n - 1, WheelSize] + 1]] + Floor[(n - 1)/WheelSize] FullWheel

(* There are only 3 changes from the previous version of this code 
 -the use of "Coprimes" in d[n,1,a]
 -the use of "Use" in d[n,k,a]
 -the addition of "WheelEntries" in CountPrimes.  *)
d[n_, 0, a_] := 1
d[n_, 1, a_] := d[n,1,a]=Coprimes[n] - Coprimes[a]
d[n_, k_, a_] := d[n,k,a]=
 Sum[If[ Use[m] == 0, 0, Binomial[k, j] d[Floor[n/(m^(k - j))], j, m]], {m, a + 1, n^(1/k)}, {j, 0, k - 1}]
RiemannPrimeCounting[n_] := Sum[(-1)^(k + 1)/k  d[Floor[n], k, 1], {k, 1, Log[2, n]}]
CountPrimes[ n_] := WheelEntries + Sum[MoebiusMu[k] RiemannPrimeCounting[n^(1/k)]/k, {k, 1, Log[2, n]}]

So that’s the approach. Here’s a C implementation of the same idea:

#include "stdlib.h"
#include "math.h"
#include "time.h"
#include "stdio.h"

const int wheelLargestPrime =        19;                     /* The largest prime excluded by the wheel.  Note that the wheel will require primorial( wheelLargestPrime ) * 8 bytes of RAM.*/
const int binomialLayersToCache =    30;                    /* Number of Binomials to cache.  This is probably enough for most values.*/
const double EPSILON =               .00000000001;        /* Added to account for floating point errors when converting from doubles to long longs*/

int primes[] = { 0, 2,3,5,7,11,13,17, 19,23,29,31,37,41,43,47,53,59};
long long mu[] = { 0, 1, -1, -1, 0, -1, 1, -1, 0, 0, 1, -1, 0, -1, 1, 1, 0, -1, 0, -1, 0, 1, 1, -1, 0, 0, 1, 0, 0, -1, -1};

long long binomial[binomialLayersToCache][binomialLayersToCache];
void MakeBinomial(){
    for( int k = 0; k < binomialLayersToCache; k++ )
        for( int j = 0; j <= k; j++ ){ 
            double m = 1;
            for( double i = 1; i <= j; i++ )m *= ( k - ( j - i ) ) / i;
            binomial[ k ][ j ] = long long( m + .5 );
        }
}

int primesToSieve, firstNonSieveEntry;
int wheelCycleEntries, wheelCyclePeriod, wheelFirstPrime, wheelBasePrimes;
int *wheelTranslation = 0, *wheelOffsets = 0;
void MakeWheel(){
    wheelBasePrimes = 1;
    while( primes[ wheelBasePrimes ] <= wheelLargestPrime )wheelBasePrimes++;
    wheelCyclePeriod = 1;
    int i = 1;
    while( primes[ i ] <= wheelLargestPrime ){
        wheelCyclePeriod *= primes[ i ];
        i++;
    }
    wheelCycleEntries = 0;

    int cur = 0, offset = -1, *wheelBase = 0;

    wheelBase = ( int* )malloc( wheelCyclePeriod * sizeof( int ) );
    wheelTranslation = ( int* )malloc( ( wheelCyclePeriod + 1 ) * sizeof( int ) );
    wheelOffsets = ( int * )malloc( wheelCyclePeriod * sizeof( int ) );
    i = 1;
    while( primes[ i ] <= wheelLargestPrime )i++;
    wheelFirstPrime = primes[ i ];

    for( int i = 0; i < wheelCyclePeriod; i++ ){
        wheelBase[ i ] = 1;
        for( int j = 2; j <= wheelLargestPrime; j++ )
            if( !( ( i + 1 ) % j ) ){
                wheelBase[ i ] = 0;
                break;
            }
    }
    while( cur < wheelCyclePeriod ){
        if( wheelBase[ cur ] && cur != 0 ){
            wheelOffsets[ wheelCycleEntries ] = offset + 1;
            offset = 0;
            wheelCycleEntries++;
        }
        else offset++;
        cur++;
    }
    wheelOffsets[ wheelCycleEntries ] = 2;
    wheelCycleEntries++;	
    int total = 0;
    wheelTranslation[ 0 ] = 0;
    for( long long i = 0; i < wheelCyclePeriod; i++ ){
        if( i && wheelBase[ i - 1 ] )total++;
        wheelTranslation [ i + 1 ] = total;
    }
    free( wheelBase );
    firstNonSieveEntry = wheelFirstPrime; 
    primesToSieve = wheelBasePrimes - 1;
}
inline void IncrementWheel( int &numberRangeID ){
    numberRangeID++;
    if( numberRangeID >= wheelCycleEntries )numberRangeID = 0;
}
inline long long WheelEntriesForRange( long long rangeEnd ){
    rangeEnd++;
    int b = rangeEnd % wheelCyclePeriod;
    return ( rangeEnd - b ) / wheelCyclePeriod * wheelCycleEntries + wheelTranslation[ b ];
}
inline void IncrementNumber( long long &n, int &numberRangeID ){
    IncrementWheel( numberRangeID );
    n += wheelOffsets[ numberRangeID ];
}

long long D( int k, long long a, int numberRangeID, long long n ){
    if( k == 0 )return 1; 
    if( k == 1 )return WheelEntriesForRange( n ) - WheelEntriesForRange( a )+1;
    long long t = 0;
    while( a <= pow(n, 1.0 / k) + EPSILON ){
        long long nn = n/a;
        long long pa = a;
        IncrementNumber( a, numberRangeID );
        t += binomial[k][k];
        for( int j = 1; j < k; j++ ){
            t += D( k-j, a, numberRangeID, nn )*binomial[k][j];
            nn /= pa;
        }
    }
    return t;
}
long long CountPrimes(long long n){ 
    double t = 0.0; 
    for (int j = 1; j < log((double)n) / log((double)firstNonSieveEntry); j++)
        for (int k = 1; k < log( pow( n, 1.0 / j ) ) / log((double)firstNonSieveEntry); k++)
            t += pow( -1.0, (double)k + 1 ) * D(k, firstNonSieveEntry, 0, long long( pow(n, 1.0 / j) + EPSILON ) ) / k / j * mu[j];
    return t + .5 + primesToSieve;
}

int main(int argc, char* argv[]){
    MakeBinomial();
    MakeWheel();

    int oldClock = (int)clock(), lastDif = 0;
	char* spaces = "                                                                    ";
	printf("%sTime\n%sIncrease\n%sfor x10\n", spaces, spaces, spaces);
    printf( "         __ Input Number __   __ Output Number __ _ MSec _ _ Sec _  Input\n\n");
    for( long long i = 10; i <= 1000000000000000000; i *= 10 ){
        printf( "%17I64d(10^%4.1f): ", i, log( (double)i )/log(10.0) );
        long long total = CountPrimes( i );
        int newClock = (int)clock();
        printf( " %20I64d %8d : %4d: x%f\n", total, newClock - oldClock, ( newClock - oldClock ) / CLK_TCK,
            ( lastDif ) ? (double)( newClock - oldClock ) / (double)lastDif : 0.0 );
        lastDif = newClock - oldClock;
        oldClock = newClock;
    }
    return 0;
}

So what do you folks think? How much trouble would I likely run into trying to CUDA-ize this algorithm? And what kind of speed-up could I expect from going down that road?

Thanks!