I am bringing in the cavalry to hijack my thread back.

Here’s some code to sieve primes on the GPU, in Erastothenes flavour. It uses bitwise marking in shared memory to sieve. Memory races are avoided by large strides of 17 bits or more, avoiding to hit the same byte from several threads. One thread block sieves exactly 120120 primes, which fits into 15015 bytes in shared memory.

The very first thread block runs “solo” on the GPU and supplies the primes for sieving on a (possible huge) grid of following blocks. The primes determined by the first block are copied to constant memory for better speed.

Sieving up to 1 billion (1E9) primes on a 8800GT is possible in 4.6 seconds. Mind the watchdog timer on lesser machines.

Example of use (slightly abridged output):

```
buchner@linux-dv5:~/NVIDIA_CUDA_SDK/projects/primes> ../../bin/linux/release/primes --min=0 --max=40000000
Computing on GPU...
Processing time (GPU): 200.789993 (ms)
Computing on CPU...
Processing time (CPU): 640.202026 (ms)
CPU says there are 2433654 primes from 0 to including 40000000.
GPU says there are 2433654 primes from 0 to including 40000000.
```

Based on an SDK makefile, so place everything in a subfolder of your “projects” dir. Tested on CUDA SDK V 2.2

```
############################################################
#
# Build script for project
#
############################################################
# Add source files here
EXECUTABLE := primes
# CUDA source files (compiled with cudacc)
CUFILES := primes.cu
# CUDA dependency files
CU_DEPS := primes_kernel.cu
# C/C++ source files (compiled with gcc / c++)
CCFILES := primes_gold.cpp
############################################################
# Rules and targets
include ../../common/common.mk
```

Host side code:

```
/**
* Sieve of Erastothenes on CPU and GPU.
* originally based on Template project.
* Host code.
*/
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
// includes, project
#include <cutil_inline.h>
// includes, kernels
#include <primes_kernel.cu>
////////////////////////////////////////////////////////////////////////////////
// declaration, forward
void runTest( int argc, char** argv);
extern "C"
void computePrimes( unsigned char* bitbuffer, unsigned long long bitbufsize, unsigned long long N );
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main( int argc, char** argv)
{
runTest( argc, argv);
cutilExit(argc, argv);
}
////////////////////////////////////////////////////////////////////////////////
//! Run prime sieve on CUDA and CPU
////////////////////////////////////////////////////////////////////////////////
void
runTest( int argc, char** argv)
{
unsigned int N = 120119; // largest number N in first block
unsigned int NUM = N+1; // number of integers in range [0...N]
unsigned int bitbufsize = (NUM+7)/8; // number of bytes required to hold bit pattern per block
unsigned char *bitbuffer_gold = NULL;
int min, max;
// use command-line specified CUDA device, otherwise use device with highest Gflops/s
if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") )
cutilDeviceInit(argc, argv);
else
cudaSetDevice( cutGetMaxGflopsDeviceId() );
// TODO: possibly extend this to support 64 bit integer arguments
if (!cutGetCmdLineArgumenti( argc, (const char**) argv, "min", &min))
min = 0;
if (!cutGetCmdLineArgumenti( argc, (const char**) argv, "max", &max))
{
max = 100000000;
fprintf(stderr, "Use --max option to set upper bound.\n");
}
unsigned long long RANGEMIN = min;
unsigned long long RANGEMAX = max;
unsigned int BLOCKS = ((RANGEMAX+1) + (NUM-1)) / NUM;
fprintf(stderr, "RANGE is [%lld...%lld]\n", RANGEMIN, RANGEMAX);
fprintf(stderr, "%d BLOCKS are needed to cover that range.\n", BLOCKS);
fprintf(stderr, "Total memory requirement for bit array is %lld kB.\n", BLOCKS*bitbufsize / 1024);
fprintf(stderr, "Allocating and clearing host memory.\n");
bitbuffer_gold = (unsigned char*)calloc(1, BLOCKS*bitbufsize);
unsigned int timer = 0;
unsigned int timer2 = 0;
fprintf(stderr, "Allocating and clearing device memory.\n");
// allocate and cleardevice memory for result
unsigned char* d_bitbuffer = NULL;
cutilSafeCall( cudaMalloc( (void**) &d_bitbuffer, BLOCKS*bitbufsize));
cutilSafeCall( cudaMemset( d_bitbuffer, 0, BLOCKS*bitbufsize ) );
fprintf(stderr, "Precomputing repeating bit pattern.\n");
// allocate memory for the precomputed pattern on host side
unsigned char* h_prepattern = (unsigned char*) calloc(1, bitbufsize);
unsigned int *h_iprepattern = (unsigned int*) h_prepattern;
// mark out ALL multiples of given primes in precomputed pattern
// We could also store this in a table of size 15015 bytes, saving the need for computation.
unsigned int primes[] = {2,3,5,7,11,13};
for (int j=0; j<sizeof(primes)/sizeof(int); j++)
for (unsigned long long i = 0; i <= N; i += primes[j])
h_iprepattern[i/32] |= 1<<(i&31);
// allocate device memory for precomputed pattern and copy pattern to device
unsigned char* d_prepattern = NULL;
cutilSafeCall( cudaMalloc( (void**) &d_prepattern, bitbufsize));
cutilSafeCall( cudaMemcpy( d_prepattern, h_prepattern, bitbufsize, cudaMemcpyHostToDevice ));
// setup execution parameters
dim3 gridsize( 1, 1, 1);
dim3 blocksize( 32*6, 1, 1);
dim3 gridsize2( BLOCKS-1, 1, 1);
dim3 blocksize2( 32*6, 1, 1);
fprintf(stderr, "Cooking kernels...\n");
// cook the first kernel
primesSMKernel<<< gridsize, blocksize, bitbufsize >>>( d_prepattern, d_bitbuffer, bitbufsize, 0 );
cudaThreadSynchronize();
if (BLOCKS > 1)
{
// cook the second kernel
primesSMKernel2<<< gridsize2, blocksize2, bitbufsize >>>( d_prepattern, d_bitbuffer, bitbufsize, 0 );
cudaThreadSynchronize();
}
fprintf(stderr, "Computing on GPU...\n");
cutilCheckError( cutCreateTimer( &timer));
cutilCheckError( cutStartTimer( timer));
// execute the first kernel
primesSMKernel<<< gridsize, blocksize, bitbufsize >>>( d_prepattern, d_bitbuffer, bitbufsize, NUM );
cudaThreadSynchronize();
if (BLOCKS > 1)
{
// copy the precomputed prime bitarray into constant memory (for better caching)
cudaMemcpyToSymbol( d_prebitbuffer, d_bitbuffer, bitbufsize, 0, cudaMemcpyDeviceToDevice );
// execute the second kernel
primesSMKernel2<<< gridsize2, blocksize2, bitbufsize >>>( d_prepattern, d_bitbuffer, bitbufsize, NUM );
cudaThreadSynchronize();
}
// check if kernel execution generated and error
cutilCheckMsg("Kernel execution failed");
cutilCheckError( cutStopTimer( timer));
// check if kernel execution generated and error
cutilCheckMsg("Kernel execution failed");
fprintf(stderr, "Processing time (GPU): %f (ms)\n", cutGetTimerValue( timer));
cutilCheckError( cutDeleteTimer( timer));
// allocate mem for the result bitarray on host side
unsigned char* h_bitbuffer = (unsigned char*) malloc(BLOCKS * bitbufsize);
// copy result bitarray to host side
cutilSafeCall( cudaMemcpy( h_bitbuffer, d_bitbuffer, BLOCKS * bitbufsize, cudaMemcpyDeviceToHost ));
fprintf(stderr, "Computing on CPU...\n");
cutilCheckError( cutCreateTimer( &timer2));
cutilCheckError( cutStartTimer( timer2));
// compute reference solution
computePrimes( bitbuffer_gold, BLOCKS*bitbufsize, BLOCKS*NUM-1 );
cutilCheckError( cutStopTimer( timer2));
fprintf(stderr, "Processing time (CPU): %f (ms)\n", cutGetTimerValue( timer2));
cutilCheckError( cutDeleteTimer( timer2));
// count primes on CPU side
unsigned long long count = 0;
for (unsigned long long i=RANGEMIN; i <= RANGEMAX; i++)
{
if ((bitbuffer_gold[i/8] & (1<<(i&7))) == 0)
count++;
}
fprintf(stderr, "CPU says there are %lld primes from %lld to including %lld.\n", count, RANGEMIN, RANGEMAX);
// count primes on GPU side
count = 0;
unsigned int *h_ibitbuffer = (unsigned int*)h_bitbuffer;
for (unsigned long long i=RANGEMIN; i <= RANGEMAX; i++)
{
if ((h_ibitbuffer[i/32] & (1<<(i&31))) == 0)
count++;
}
fprintf(stderr, "GPU says there are %lld primes from %lld to including %lld.\n", count, RANGEMIN, RANGEMAX);
// compare the results bit by bit and warn on mismatches
for (unsigned long long i=0; i <= BLOCKS*NUM; i++)
{
bool cpu = (bitbuffer_gold[i/8] & (1<<(i&7))) == 0;
bool gpu = (h_ibitbuffer[i/32] & (1<<(i&31))) == 0;
if (cpu != gpu) fprintf(stderr, "WARNING: CPU and GPU disagree on primality of %lld (CPU: %d, GPU: %d)\n", i, cpu, gpu);
}
free(bitbuffer_gold);
free(h_bitbuffer);
cutilSafeCall(cudaFree(d_bitbuffer));
cudaThreadExit();
}
```

“Gold” kernel:

```
**/
* Sieve of Eratosthenes.
* Host Code
*/
#include <math.h>
#include <stdio.h>
////////////////////////////////////////////////////////////////////////////////
// export C interface
extern "C"
void computePrimes( unsigned char* bitbuffer, unsigned long long bitbufsize, unsigned long long N );
////////////////////////////////////////////////////////////////////////////////
//! Compute reference prime set
////////////////////////////////////////////////////////////////////////////////
void
computePrimes( unsigned char* bitbuffer, unsigned long long bitbufsize, unsigned long long N )
{
unsigned long long firstprime = 2;
unsigned long long sqrt_N = ceil(sqrt(N));
bitbuffer[0] |= 3; // 0 and 1 are not primes.
for (;;)
{
// mark out multiples of primes
for (unsigned long long i = firstprime * firstprime; i <= N; i += firstprime)
bitbuffer[i/8] |= 1<<(i&7);
// search for next prime
for (firstprime = firstprime + 1; firstprime < N; firstprime++)
if ((bitbuffer[firstprime/8] & (1<<(firstprime&7))) == 0) break;
if (firstprime > sqrt_N) break;
}
}
```

GPU code, two kernels

```
/* Sieve of Eratosthenes
* Device Code
*/
#ifndef _PRIMES_KERNEL_H_
#define _PRIMES_KERNEL_H_
#include <stdio.h>
#define SDATA( index) cutilBankChecker(sdata, index)
#ifdef __DEVICE_EMULATION__
#define EMUSYNC __syncthreads()
#else
#define EMUSYNC
#endif
#define __register__ volatile
////////////////////////////////////////////////////////////////////////////////
//! Sieve of Eratosthenes on GPU
////////////////////////////////////////////////////////////////////////////////
__global__ void
primesSMKernel( unsigned char *g_prepattern, unsigned char *g_bitbuffer, unsigned int bitbufsize, unsigned int NUM )
{
extern unsigned char __shared__ sdata[];
if (NUM > 0)
{
__register__ unsigned int *isdata = (unsigned int*)&sdata[0];
__register__ unsigned int *g_ibitbuffer = (unsigned int*)g_bitbuffer;
__register__ unsigned int *g_iprepattern = (unsigned int*)g_prepattern;
__register__ unsigned int num = bitbufsize / 4; // round down
__register__ unsigned int remain = bitbufsize % 4; // remainder
__register__ const unsigned int idx = threadIdx.x;
// initialize shared memory with precomputed bit pattern for primes 2, 3, 5, 7, 11, 13
for (__register__ int i=0; i < num; i+= blockDim.x)
if (i+idx < num) isdata[i+idx] = g_iprepattern[i+idx];
if (idx < remain) sdata[4*num+idx] = g_prepattern[4*num+idx];
__syncthreads();
unsigned int __shared__ firstprime;
__register__ unsigned int sqrt_N = ceil(sqrtf((float)NUM));
if (threadIdx.x == 0)
{
firstprime = 17; // start marking multiples of primes beginning with prime 11
sdata[0] = 0x53; // 2 is prime, 3 is prime, 5 is prime, 7 is prime, the rest in this byte isn't
sdata[1] = 0xd7; // 11 is prime, 13 is prime
}
__syncthreads();
while (firstprime <= sqrt_N)
{
// mark out all multiples of "firstprime" starting with firstprime squared.
for (unsigned int i = (firstprime+idx) * firstprime; i < NUM; i += firstprime*blockDim.x)
sdata[i>>3] |= (1<<(i&7));
__syncthreads();
// search for next prime (unmarked number) in the bit array using a single thread
if (threadIdx.x == 0)
for (firstprime = firstprime + 1; firstprime < NUM; firstprime++)
if ((sdata[firstprime>>3] & (1<<(firstprime&7))) == 0) break;
__syncthreads();
}
// coalesced and bank-conflict free 32 bit integer copy from shared to global
for (__register__ int i=0; i < num; i+= blockDim.x)
if (i+idx < num) g_ibitbuffer[i+idx] = isdata[i+idx];
// copy remaining bytes
if (idx < remain) g_bitbuffer[4*num+idx] = sdata[4*num+idx];
}
}
__device__ __constant__ unsigned char d_prebitbuffer[65536];
__global__ void
primesSMKernel2( unsigned char *g_prepattern, unsigned char *g_bitbuffer, unsigned int bitbufsize, unsigned int NUM )
{
extern unsigned char __shared__ sdata[];
if (NUM > 0)
{
__register__ unsigned int *isdata = (unsigned int*)&sdata[0];
__register__ unsigned int *g_ibitbuffer = (unsigned int*)g_bitbuffer;
__register__ unsigned int *g_iprepattern = (unsigned int*)g_prepattern;
__register__ unsigned int num = bitbufsize / 4; // round down
__register__ unsigned int remain = bitbufsize % 4; // remainder
__register__ const unsigned int idx = threadIdx.x;
// initialize shared memory with precomputed bit pattern for primes 2, 3, 5, 7, 11, 13
for (__register__ int i=0; i < num; i+= blockDim.x)
if (i+idx < num) isdata[i+idx] = g_iprepattern[i+idx];
if (idx < remain) sdata[4*num+idx] = g_prepattern[4*num+idx];
// K is the block-specific offset
unsigned long long K = NUM * blockIdx.x + NUM;
__syncthreads();
unsigned int __shared__ firstprime;
__register__ unsigned int sqrt_KN = ceil(sqrtf((float)(K+NUM)));
if (threadIdx.x == 0)
{
firstprime = 17; // start marking multiples of primes beginning with prime 17
}
__syncthreads();
while (firstprime <= sqrt_KN)
{
// compute an offset such that we're instantly entering the range of
// K...K+N in the loop below
// Because 64 bit division is costly, use only the first thread
unsigned int __shared__ offset;
if (threadIdx.x == 0)
{
offset = 0;
if (K >= firstprime*firstprime) offset = (K-firstprime*firstprime) / firstprime;
}
__syncthreads();
// mark out all multiples of "firstprime" that fall into this thread block, starting
// with firstprime squared.
for (unsigned long long i = (offset+firstprime+idx) * firstprime;
i < K+NUM;
i += firstprime*blockDim.x)
if (i >= K) sdata[(i-K)>>3] |= (1<<((i-K)&7));
__syncthreads();
// search for next prime (unmarked number) in the reference bit array using a single thread
if (threadIdx.x == 0)
for (firstprime = firstprime + 1; firstprime < NUM; firstprime++)
if ((d_prebitbuffer[firstprime>>3] & (1<<(firstprime&7))) == 0) break;
__syncthreads();
}
// byte-by-byte uncoalesced and bank-conflict-prone copy from shared to global memory
// TODO: create a generic coalesced copy routine that works with arbitrary
// output byte offsets on compute 1.0 and 1.1 hardware
__register__ unsigned int byteoff = bitbufsize * blockIdx.x + bitbufsize;
for (__register__ int i=0; i < bitbufsize; i+= blockDim.x)
if (i+idx < bitbufsize) g_bitbuffer[byteoff+i+idx] = sdata[i+idx];
}
}
#endif // #ifndef _PRIMES_KERNEL_H_
```

Christian