hey guys, so I’m learning CUDA and wrote a little program which generates prime numbers using the Sieve of Eratosthenes. (I know the limitations of CUDA, specially with memory sizes and limits, but this program is for educational purposes).
So my questions are:
- Did I set up the configuration correctly? (did I set dimBlock and dimGrid correctly?)
- In my kernel I seem to set a "maxRoot" constant variable for each thread spawned. Is there a way to set that once in the GPU and then have that shared across all threads??
- In the given code below, on line 23 - there's optimization #2 - sieve only odds. How can I apply that to my kernel? The way I'm currently doing is spawning threads from 1 to sqrt(max) and assigning each of them (so much like looping, but incrementing i by 1 each time. In that line of code, we see that it starts at 3 and increments by 2 in the for loop.)
- Can you guys give me any other feedback on my code? What else am I doing wrong, or what else could be improved?
This is the HOST code for the sieve I attempted to implement in CUDA:
void sieveOfEratosthenes(uint64_t max)
{
// There are no prime numbers smaller than 2
if (max >= 2)
{
max++;
// Create an array of size n and initialize all elements as 0
char arr[max]; // could call calloc() and return a pointer
memset(arr, 0, sizeof(char) * max);
arr[0] = 1;
arr[1] = 1;
uint64_t maxRoot = sqrt(max); // optimization #1
uint64_t i;
int j; // sieve multiples of two
for (j = 2 * 2; j < max; j += 2)
{
arr[j] = 1;
}
/* for (i = 2; i <= maxRoot; i++) */
for (i = 3; i <= maxRoot; i += 2) // optimization #2 - sieve only odds
{
if (arr[i] == 0 )
{
int j;
for (j = i * i; j < max; j += i)
{
arr[j] = 1;
}
}
}
// display
for (i = 0; i < max; i++)
if (arr[i] == 0)
printf("%d ", i);
printf("\n");
}
}
This is my actual cuda file: prime.cu
#include <stdio.h>
#include <helper_cuda.h> // checkCudaErrors()
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <cuda_runtime.h>
typedef unsigned long long int uint64_t;
/******************************************************************************
* kernel for finding prime numbers using the sieve of eratosthenes
* - primes: an array of bools. initially all numbers are set to "0".
* A "0" value means that the number at that index is prime.
* - max: the max size of the primes array
******************************************************************************/
__global__ static void sieveOfEratosthenesCUDA(char *primes, uint64_t max)
{
// first thread 0
if (threadIdx.x == 0 && threadIdx.y == 0)
{
primes[0] = 1; // value of 1 means the number is NOT prime
primes[1] = 1; // numbers "0" and "1" are not prime numbers
// sieve multiples of two
for (int j = 2 * 2; j < max; j += 2)
{
primes[j] = 1;
}
}
else
{
int index = blockIdx.x * blockDim.x + threadIdx.x;
const uint64_t maxRoot = sqrt((double)max);
// make sure index won't go out of bounds, also don't execute it
// on index 1
if (index < maxRoot && primes[index] == 0 && index > 1 )
{
// mark off the composite numbers
for (int j = index * index; j < max; j += index)
{
primes[j] = 1;
}
}
}
}
/******************************************************************************
* checkDevice()
******************************************************************************/
__host__ int checkDevice()
{
// query the Device and decide on the block size
int devID = 0; // the default device ID
cudaError_t error;
cudaDeviceProp deviceProp;
error = cudaGetDevice(&devID);
if (error != cudaSuccess)
{
printf("cudaGetDevice returned error code %d, line(%d)\n", error, __LINE__);
exit(EXIT_FAILURE);
}
error = cudaGetDeviceProperties(&deviceProp, devID);
if (deviceProp.computeMode == cudaComputeModeProhibited || error != cudaSuccess)
{
printf("CUDA device ComputeMode is prohibited or failed to getDeviceProperties\n");
return EXIT_FAILURE;
}
// Use a larger block size for Fermi and above (see compute capability)
return (deviceProp.major < 2) ? 16 : 32;
}
/******************************************************************************
* genPrimesOnDevice
* - inputs: limit - the largest prime that should be computed
* primes - an array of size [limit], initialized to 0
******************************************************************************/
__host__ void genPrimesOnDevice(char* primes, uint64_t max)
{
int blockSize = checkDevice();
if (blockSize == EXIT_FAILURE)
return;
char* d_Primes = NULL;
int sizePrimes = sizeof(char) * max;
uint64_t maxRoot = sqrt(max);
// allocate the primes on the device and set them to 0
checkCudaErrors(cudaMalloc(&d_Primes, sizePrimes));
checkCudaErrors(cudaMemset(d_Primes, 0, sizePrimes));
// make sure that there are no errors...
checkCudaErrors(cudaPeekAtLastError());
// setup the execution configuration
dim3 dimBlock(maxRoot, 1, 1);
dim3 dimGrid(1);
//////// debug
#ifdef DEBUG
printf("dimBlock(%d, %d, %d)\n", dimBlock.x, dimBlock.y, dimBlock.z);
printf("dimGrid(%d, %d, %d)\n", dimGrid.x, dimGrid.y, dimGrid.z);
#endif
// call the kernel
sieveOfEratosthenesCUDA<<<dimGrid, dimBlock>>>(d_Primes, max);
// check for kernel errors
checkCudaErrors(cudaPeekAtLastError());
checkCudaErrors(cudaDeviceSynchronize());
// copy the results back
checkCudaErrors(cudaMemcpy(primes, d_Primes, sizePrimes, cudaMemcpyDeviceToHost));
// no memory leaks
checkCudaErrors(cudaFree(d_Primes));
}
/******************************************************************************
*
******************************************************************************/
int main()
{
uint64_t maxPrime = 102; // find all primes from 0-101
char* primes = (char*) malloc(maxPrime);
memset(primes, 0, maxPrime); // initialize all elements to 0
genPrimesOnDevice(primes, maxPrime);
// display the results
int i;
for (i = 0; i < maxPrime; i++)
if (primes[i] == 0)
printf("%i ", i);
printf("\n");
free(primes);
return 0;
}
Any kind of feedback, improvements is good. Thanks.