I’m working on a simple prime sieve right now. The gist of it is for n<=2^32, it sieves all primes upto n using a segboolsieve() (defined in smallsieve.h), and for n>2^32, segboolsieve() returns an array P containing all primes less than sqrt(n).
In the latter case, the host rounds n up to the nearest multiple of 2^32 (for convenience sake), and splits the digits between 2^32 and n into segments of 2^32 digits a piece.
The kernel works on one segment at a time, dividing it into threads sub-segments, and each thread sieves their respective sub-segment. I initially allocated each segment in global memory before sieving it, but that caused kernel launch failures for a reason I have yet to understand. Instead, I just had each thread allocate memory for their respective sub-segment and work on it instead. Each thread is working with a 256kB sub-segment given threads = 1024.
I’m only sieving odd numbers, and for the sake of space-efficiency I’ve mapped each number to a bit in an array of 64-bit integers.
CORE.cu
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "smallsieve.h"
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
typedef unsigned __int64 uint64;
typedef unsigned __int32 uint32;
const uint64 pow2_32 = 4294967296;
const uint32 threads = 1024;
const uint64 segsize = pow2_32 / threads;
///////////////////////////KERNELS START///////////////////////////
//Checks if x is prime; if bit corresponding to x is 0, then return true.
__device__ bool isPrime(uint64 *mark, uint64 x)
{
return (mark[x / 128] & ((uint64)1 << ((x >> 1) & 63))) ? 0 : 1;
}
//Set the bit corresponding to x
__device__ void bitSet(uint64 *mark, uint64 x)
{
mark[x / 128] |= ((uint64)1 << ((x >> 1) & 63));
}
__global__ void SieveBlock(uint32 *P, uint32 completed, uint32 plen)
{
//Each thread sieves [(pow2_32 >> 1) / threads] elements of the current block
uint64 mark[(segsize / 128) + 1];
uint64 id, i, j, minb, min, max, prime, temp1, temp2;
id = threadIdx.x;
min = (completed*pow2_32) + (id*segsize) + 1;
max = min + segsize - 2;
for (i = 0;(P[i] * P[i] <= max) & (i<plen);i++)
{
prime = P[i];
minb = ((min / prime) * prime) + P[i];
if (~minb & 1) minb += prime;
for (j = minb;j <= max;j += (prime << 1))
bitSet(mark,j-(min-1));
}
}
////////////////////////////KERNELS END////////////////////////////
//Driver function
int main(uint32 argc, char* argv[])
{
//Range: Data-type dependent
uint64 n, m;
printf("Enter n: ");
scanf("%llu", &n);
bool smallsieve = false; //Use serial sieve for n<2^32
if (n <= pow2_32)
{
smallsieve = true;
printf("Rounded %llu to ", n);
m = (uint64)sqrt(n);
n = m * m;
printf("%llu\n", n);
}
else if (n % pow2_32 > 0) //If n>2^32 then round n to nearest multiple of 2^32
{
printf("Rounded %llu to ", n);
n = ((n / pow2_32) + 1) * pow2_32;
printf("%llu\n", n);
m = (uint64)(sqrt(n));
}
uint32 plen = 0;
uint32 *P = NULL;
if (~n & 1) n--;
if (~m & 1) m--;
P = segboolsieve(n, m, plen, smallsieve);
if (P == NULL)
{
printf("Memory Allocation Failure!\n");
exit(0);
}
else
printf("Last prime in utility sieve: %u @ index [%u]\n", P[plen - 1], plen - 1);
if (smallsieve)
{
free(P);
return 0;
}
uint32 chunkcount = (uint32)((n + 1) / pow2_32); //No. of chunks
uint32 completed = 1;
printf("\n%u chunk(s) for [%llu->%llu]\n", chunkcount - 1, pow2_32 + 1, n);
uint32 *dP;
//Log execution time
float GPU_TIME = 0.0;
float temp_t;
//CUDA Malloc
cudaMalloc(&dP, (plen + 1) * (size));
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
dim3 TPB(threads, 1, 1);
cudaMemcpy(dP, P, plen * size, cudaMemcpyHostToDevice);
while (completed < chunkcount)
{
cudaEventRecord(start);
SieveBlock << <1, TPB >> > (dP, completed, plen);
cudaEventRecord(stop);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&temp_t, start, stop);
GPU_TIME += temp_t;
completed++;
}
free(P);
cudaFree(dP);
GPU_TIME /= 1000;
printf("COMPUTE-PHASE GPU Time: %0.3f seconds\n", GPU_TIME);
return 0;
}
smallsieve.h
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
typedef unsigned __int64 uint64;
typedef unsigned __int32 uint32;
const size_t size = sizeof(uint64);
//Utility function to calculate postive uint64-powers
uint64 power(uint64 val, uint64 exp)
{
uint64 temp = val;
for (uint64 i = 1; i < exp; i++) temp *= val;
return temp;
}
//Utility function to approximate no. of primes between 1->n as n/ln(n)
uint32 trimSize(uint64 n)
{
long double e = 2.7183;
uint64 exp = 1;
while (pow(e, exp) < n)
exp++;
return (uint32)(n / (exp - 2));
}
uint32 *segboolsieve(uint64 n, uint64 m, uint32 &plen, bool smallsieve)
{
uint32 blocksize = (uint32)m >> 1;
if (smallsieve)
plen = trimSize(n);
else
plen = trimSize(m + 2);
//Boolean array initialized to false
bool *mark = (bool *)calloc(blocksize + 1, sizeof(bool)); //Represents [2,3,5,7,9,11,...,sqrt(n)]
//Array to store primes b/w [2,m]
uint32 *P = (uint32 *)calloc(plen, sizeof(uint32) + 1);
if (mark == NULL || P == NULL) { printf("Memory Allocation Failed!\n"); exit(1); }
uint32 i, j, k, offset;
//Log execution time
clock_t START_TIME, END_TIME;
double CPU_TIME1 = 0.0, CPU_TIME2 = 0.0;
//Setup-Phase: Calculate all primes in the range [3,m]
START_TIME = clock();
for (i = 5, k = 1, offset = 2; i <= m; i += offset, offset = 6 - offset) //i->[3,5,7,9...,sqrt(n)] | i corresponds to mark[(i-3)/2]
{
if (!mark[i >> 1])
{
if (i*i <= n)
{
for (j = i * i; j <= m; j += (i << 1)) //j->[i^2,n] | increments by 2*i
mark[j >> 1] = true;
}
P[k++] = i;
}
}
free(mark);
P[0] = 3;
plen = k;
END_TIME = clock();
CPU_TIME1 = ((double)(END_TIME - START_TIME)) / CLOCKS_PER_SEC;
//If CUDA Sieve will be used
if (!smallsieve)
{
printf("\nSETUP-PHASE CPU Time: %0.3f seconds\n", CPU_TIME1);
return P;
}
//Calculate all primes in the range [m,n] in segmented blocks of size (m/2)
//Doubled as a blocksize of X covers 2X digits
uint32 min = (blocksize << 1) + 1;
uint32 max = (blocksize << 2) - 1;
//Blocks contain ranges [blocksize*k, blocksize*(k+1)]
START_TIME = clock();
while (min < n)
{
if (max > n) max = (uint32)n;
mark = (bool *)calloc(blocksize + 1, sizeof(bool));
for (i = 0;i < plen;i++)
{
//Find smallest odd-multiple of P[i] w/i range [min,max]
uint32 minb = (uint32)(min / P[i]) * (P[i]) + P[i];
if (~minb & 1) minb += P[i];
//Mark odd-multiples of P[i] as composite
for (j = minb; j <= max; j += 2 * P[i])
mark[(j - min) >> 1] = true;
}
for (i = min; i <= max;i += 2)
{
if (!mark[(i - min) >> 1])
P[k++] = i;
}
min += (blocksize << 1);
max += (blocksize << 1);
free(mark);
}
plen = k;
END_TIME = clock();
CPU_TIME2 += ((double)(END_TIME - START_TIME)) / CLOCKS_PER_SEC;
printf("\nSETUP-PHASE CPU Time: %0.3f seconds\n", CPU_TIME1 + CPU_TIME2);
return P;
}
Here’s the problem: to set the bit corresponding to a number, I need to perform a bitwise OR. For some reason, writing the result of this operation to the array generates the following report from CUDA-MEMCHECK:-
========= Program hit cudaErrorMemoryAllocation (error 2) due to "out of memory" on CUDA API call to cudaLaunchKernel.
========= Saved host backtrace up to driver entry point at error
========= Host Frame:C:\WINDOWS\system32\nvcuda.dll (cuD3D9UnmapVertexBuffer + 0x2e2c29) [0x2f0feb]
========= Host Frame:C:\Users\games\source\repos\CUDABitsieve\x64\Release\CUDABitsieve.exe (cudaLaunchKernel + 0x1f9) [0x1839]
========= Host Frame:C:\Users\games\source\repos\CUDABitsieve\x64\Release\CUDABitsieve.exe (SieveBlock + 0xda) [0xf3fa]
========= Host Frame:C:\Users\games\source\repos\CUDABitsieve\x64\Release\CUDABitsieve.exe (main + 0x2bb) [0xfadb]
========= Host Frame:C:\Users\games\source\repos\CUDABitsieve\x64\Release\CUDABitsieve.exe (__scrt_common_main_seh + 0x10c) [0x10184]
========= Host Frame:C:\WINDOWS\System32\KERNEL32.DLL (BaseThreadInitThunk + 0x14) [0x181f4]
========= Host Frame:C:\WINDOWS\SYSTEM32\ntdll.dll (RtlUserThreadStart + 0x21) [0x6a251]
=========
========= ERROR SUMMARY: 1 error
I reiterate, the mere act of writing the result to the array is causing an out-of-memory error. If I comment out the contents of the j-loop in the SieveBlock() kernel, the error disappears. Even substituting its contents with the following raises no errors:-
for (j = minb;j <= max;j += (prime << 1))
{
temp1 = j - (min - 1);
temp2 = mark[temp1 / 128] | 1; //Arbitrary operation involving an element of the array
}//SAFE
And assigning the result of some other operation to it also causes no errors:-
for (j = minb;j <= max;j += (prime << 1))
{
temp = j - (min - 1);
mark[temp / 128] = 5 * 10; //Result of arbitrary operation not involving array's elements written to array
}//SAFE
However writing the value to the array as shown below causes the same out-of-memory error to manifest:-
for (j = minb;j <= max;j += (prime << 1))
{
temp1 = j - (min - 1);
temp2 = mark[temp1 / 128] | 1; //Arbitrary operation involving an element of the array
mark[temp1 / 128] = temp2; //Writing result back to array
}//ERROR
And simply printing an element of the array also causes the same error:-
for (j = minb;j <= max;j += (prime << 1))
{
temp = j - (min - 1); //Note: j = 1,3,5,7,9,11,...so it's 100% within bounds
printf("%llu\n", mark[1]); //WORKS
printf("%llu\n", mark[163/128); //WORKS
printf("%llu\n", mark[prime/128); //WORKS
printf("%llu\n", mark[temp / 128]); //ERROR
}
As a relative newbie, I have no idea what exactly I’m not seeing that’s causing this problem.The logic is sound on paper, and my serial implementation works perfectly. So there must be something about CUDA that I don’t know that’s pertinent here.
Any suggestions and advice are very much appreciated. Cheers.
OS: Windows 10
IDE: VS2017 Community
CUDA: v10.0
CPU: i7 4790K
GPU: GTX 1070 Ti