[code review] my 1st CUDA program - need feedback - Sieve of Eratosthenes

unglued · July 11, 2014, 12:25am

hey guys, so I’m learning CUDA and wrote a little program which generates prime numbers using the Sieve of Eratosthenes. (I know the limitations of CUDA, specially with memory sizes and limits, but this program is for educational purposes).

So my questions are:

Did I set up the configuration correctly? (did I set dimBlock and dimGrid correctly?)
In my kernel I seem to set a "maxRoot" constant variable for each thread spawned. Is there a way to set that once in the GPU and then have that shared across all threads??
In the given code below, on line 23 - there's optimization #2 - sieve only odds. How can I apply that to my kernel? The way I'm currently doing is spawning threads from 1 to sqrt(max) and assigning each of them (so much like looping, but incrementing i by 1 each time. In that line of code, we see that it starts at 3 and increments by 2 in the for loop.)
Can you guys give me any other feedback on my code? What else am I doing wrong, or what else could be improved?

This is the HOST code for the sieve I attempted to implement in CUDA:

void sieveOfEratosthenes(uint64_t max)
{
   // There are no prime numbers smaller than 2                                 
   if (max >= 2)
   {
      max++;
      // Create an array of size n and initialize all elements as 0             
      char arr[max]; // could call calloc() and return a pointer                
      memset(arr, 0, sizeof(char) * max);
      arr[0] = 1;
      arr[1] = 1;

      uint64_t maxRoot = sqrt(max); // optimization #1                          
      uint64_t i;

      int j; // sieve multiples of two                                    
      for (j = 2 * 2; j < max; j += 2)
      {
         arr[j] = 1;
      }

      /* for (i = 2; i <= maxRoot; i++)  */
      for (i = 3; i <= maxRoot; i += 2) // optimization #2 - sieve only odds    
      {
         if (arr[i] == 0 )
         {
            int j;
            for (j = i * i; j < max; j += i)
            {
               arr[j] = 1;
            }

         }
      }

      // display                                                                
      for (i = 0; i < max; i++)
         if (arr[i] == 0)
            printf("%d ", i);
      printf("\n");
   }
}

This is my actual cuda file: prime.cu

#include <stdio.h>
#include <helper_cuda.h> // checkCudaErrors()
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <cuda_runtime.h>

typedef unsigned long long int uint64_t;

/******************************************************************************
* kernel for finding prime numbers using the sieve of eratosthenes
* - primes: an array of bools. initially all numbers are set to "0".
*           A "0" value means that the number at that index is prime.
* - max: the max size of the primes array
******************************************************************************/
__global__ static void sieveOfEratosthenesCUDA(char *primes, uint64_t max)
{
   // first thread 0
   if (threadIdx.x == 0 && threadIdx.y == 0)
   {
      primes[0] = 1; // value of 1 means the number is NOT prime
      primes[1] = 1; // numbers "0" and "1" are not prime numbers

      // sieve multiples of two
      for (int j = 2 * 2; j < max; j += 2)
      {
         primes[j] = 1;
      }
   }
   else
   {
      int index = blockIdx.x * blockDim.x + threadIdx.x;
      const uint64_t maxRoot = sqrt((double)max);

      // make sure index won't go out of bounds, also don't execute it
      // on index 1
      if (index < maxRoot && primes[index] == 0 && index > 1 )
      {
         // mark off the composite numbers
         for (int j = index * index; j < max; j += index)
         {
            primes[j] = 1;
         }
      }
   }
}

/******************************************************************************
* checkDevice()
******************************************************************************/
__host__ int checkDevice()
{
   // query the Device and decide on the block size
   int devID = 0; // the default device ID
   cudaError_t error;
   cudaDeviceProp deviceProp;
   error = cudaGetDevice(&devID);
   if (error != cudaSuccess)
   {
      printf("cudaGetDevice returned error code %d, line(%d)\n", error, __LINE__);
      exit(EXIT_FAILURE);
   }

   error = cudaGetDeviceProperties(&deviceProp, devID);
   if (deviceProp.computeMode == cudaComputeModeProhibited || error != cudaSuccess)
   {
      printf("CUDA device ComputeMode is prohibited or failed to getDeviceProperties\n");
      return EXIT_FAILURE;
   }

   // Use a larger block size for Fermi and above (see compute capability)
   return (deviceProp.major < 2) ? 16 : 32;
}

/******************************************************************************
* genPrimesOnDevice
* - inputs: limit - the largest prime that should be computed
*           primes - an array of size [limit], initialized to 0
******************************************************************************/
__host__ void genPrimesOnDevice(char* primes, uint64_t max)
{
   int blockSize = checkDevice();
   if (blockSize == EXIT_FAILURE)
      return;

   char* d_Primes = NULL;
   int sizePrimes = sizeof(char) * max;
   uint64_t maxRoot = sqrt(max);

   // allocate the primes on the device and set them to 0
   checkCudaErrors(cudaMalloc(&d_Primes, sizePrimes));
   checkCudaErrors(cudaMemset(d_Primes, 0, sizePrimes));

   // make sure that there are no errors...
   checkCudaErrors(cudaPeekAtLastError());

   // setup the execution configuration
   dim3 dimBlock(maxRoot, 1, 1);
   dim3 dimGrid(1);

   //////// debug
   #ifdef DEBUG
   printf("dimBlock(%d, %d, %d)\n", dimBlock.x, dimBlock.y, dimBlock.z);
   printf("dimGrid(%d, %d, %d)\n", dimGrid.x, dimGrid.y, dimGrid.z);
   #endif

   // call the kernel
   sieveOfEratosthenesCUDA<<<dimGrid, dimBlock>>>(d_Primes, max);

   // check for kernel errors
   checkCudaErrors(cudaPeekAtLastError());
   checkCudaErrors(cudaDeviceSynchronize());

   // copy the results back
   checkCudaErrors(cudaMemcpy(primes, d_Primes, sizePrimes, cudaMemcpyDeviceToHost));

   // no memory leaks
   checkCudaErrors(cudaFree(d_Primes));
}

/******************************************************************************
*
******************************************************************************/
int main()
{
   uint64_t maxPrime = 102; // find all primes from 0-101
   char* primes = (char*) malloc(maxPrime);
   memset(primes, 0, maxPrime); // initialize all elements to 0

   genPrimesOnDevice(primes, maxPrime);

   // display the results
   int i;
   for (i = 0; i < maxPrime; i++)
      if (primes[i] == 0)
         printf("%i ", i);
   printf("\n");

   free(primes);

   return 0;
}

Any kind of feedback, improvements is good. Thanks.

cbuchner1 · July 14, 2014, 10:55am

__shared__ uint64_t maxRoot;
if (threadIdx.x == 0 && threadIdx.y == 0) maxRoot = sqrt(max); // optimization #1
__syncthreads();
// maxRoot is now safe to be read from all threads

considering how fast transcendental functions are computed on the GPU, you will barely see
an improvement from the above optimization. Computing this value in the host code and passing it to the kernel as a launch argument is also an adequate optimization.

I noticed an inefficiency in your posted code.

if (threadIdx.x == 0 && threadIdx.y == 0)
{
   ...
    // sieve multiples of two
   ...
}
else
{
   ...
   // mark off the composite numbers
   ...
}

don’t do that. It will make your first warp heavily divergent. Warp divergence is bad.

unglued · July 17, 2014, 12:46am

@cbuchner1, I really appretiate your inputs! You see, as I am a beginner, I didn’t even know about Warp Divergence.
So, I’ve modified my code and although I didn’t benchmark it, I think it’s a little bit more efficient than before.

I’ve fixed the warp you mentioned. These are the modifications to the code:

split the kernel I had before into 3 kernels. One for initialization, one for sieving the even numbers, one for the rest.
I’m now passing the root to the kernel instead of having the GPU compute it.
I’ve adjusted the dimGrid and dimBlocks for the kernels accordingly

Here’s the updated prime.cu

#include <stdio.h>
#include <helper_cuda.h> // checkCudaErrors()
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <cuda_runtime.h>

typedef unsigned long long int uint64_t;

/******************************************************************************
* kernel that initializes the 1st couple of values in the primes array.
******************************************************************************/
__global__ static void sieveInitCUDA(char* primes)
{
   primes[0] = 1; // value of 1 means the number is NOT prime
   primes[1] = 1; // numbers "0" and "1" are not prime numbers
}

/******************************************************************************
* kernel for sieving the even numbers starting at 4.
******************************************************************************/
__global__ static void sieveEvenNumbersCUDA(char* primes, uint64_t max)
{
   uint64_t index = blockIdx.x * blockDim.x + threadIdx.x + threadIdx.x + 4;
   if (index < max)
      primes[index] = 1;
}

/******************************************************************************
* kernel for finding prime numbers using the sieve of eratosthenes
* - primes: an array of bools. initially all numbers are set to "0".
*           A "0" value means that the number at that index is prime.
* - max: the max size of the primes array
* - maxRoot: the sqrt of max (the other input). we don't wanna make all threads
*   compute this over and over again, so it's being passed in
******************************************************************************/
__global__ static void sieveOfEratosthenesCUDA(char *primes, uint64_t max,
                                               const uint64_t maxRoot)
{
   // get the starting index, sieve only odds starting at 3
   // 3,5,7,9,11,13...
   /* int index = blockIdx.x * blockDim.x + threadIdx.x + threadIdx.x + 3; */

   // apparently the following indexing usage is faster than the one above. Hmm
   int index = blockIdx.x * blockDim.x + threadIdx.x + 3;

   // make sure index won't go out of bounds, also don't start the execution
   // on numbers that are already composite
   if (index < maxRoot && primes[index] == 0)
   {
      // mark off the composite numbers
      for (int j = index * index; j < max; j += index)
      {
         primes[j] = 1;
      }
   }
}

/******************************************************************************
* checkDevice()
******************************************************************************/
__host__ int checkDevice()
{
   // query the Device and decide on the block size
   int devID = 0; // the default device ID
   cudaError_t error;
   cudaDeviceProp deviceProp;
   error = cudaGetDevice(&devID);
   if (error != cudaSuccess)
   {
      printf("CUDA Device not ready or not supported\n");
      printf("%s: cudaGetDevice returned error code %d, line(%d)\n", __FILE__, error, __LINE__);
      exit(EXIT_FAILURE);
   }

   error = cudaGetDeviceProperties(&deviceProp, devID);
   if (deviceProp.computeMode == cudaComputeModeProhibited || error != cudaSuccess)
   {
      printf("CUDA device ComputeMode is prohibited or failed to getDeviceProperties\n");
      return EXIT_FAILURE;
   }

   // Use a larger block size for Fermi and above (see compute capability)
   return (deviceProp.major < 2) ? 16 : 32;
}

/******************************************************************************
* genPrimesOnDevice
* - inputs: limit - the largest prime that should be computed
*           primes - an array of size [limit], initialized to 0
******************************************************************************/
__host__ void genPrimesOnDevice(char* primes, uint64_t max)
{
   int blockSize = checkDevice();
   if (blockSize == EXIT_FAILURE)
      return;

   char* d_Primes = NULL;
   int sizePrimes = sizeof(char) * max;
   uint64_t maxRoot = sqrt(max);

   // allocate the primes on the device and set them to 0
   checkCudaErrors(cudaMalloc(&d_Primes, sizePrimes));
   checkCudaErrors(cudaMemset(d_Primes, 0, sizePrimes));

   // make sure that there are no errors...
   checkCudaErrors(cudaPeekAtLastError());

   // setup the execution configuration
   dim3 dimBlock(blockSize);
   dim3 dimGrid((maxRoot + dimBlock.x) / dimBlock.x);
   dim3 dimGridEvens(((max + dimBlock.x) / dimBlock.x) / 2);

   //////// debug
   #ifdef DEBUG
   printf("dimBlock(%d, %d, %d)\n", dimBlock.x, dimBlock.y, dimBlock.z);
   printf("dimGrid(%d, %d, %d)\n", dimGrid.x, dimGrid.y, dimGrid.z);
   #endif

   // call the kernel
   // NOTE: no need to synchronize after each kernel
   // http://stackoverflow.com/a/11889641/2261947
   sieveInitCUDA<<<1, 1>>>(d_Primes); // launch a single thread to initialize
   sieveEvenNumbersCUDA<<<dimGridEvens, dimBlock>>>(d_Primes, max);
   sieveOfEratosthenesCUDA<<<dimGrid, dimBlock>>>(d_Primes, max, maxRoot);

   // check for kernel errors
   checkCudaErrors(cudaPeekAtLastError());
   checkCudaErrors(cudaDeviceSynchronize());

   // copy the results back
   checkCudaErrors(cudaMemcpy(primes, d_Primes, sizePrimes, cudaMemcpyDeviceToHost));

   // no memory leaks
   checkCudaErrors(cudaFree(d_Primes));
}

Topic		Replies	Views
Sieve of Erastothenes CUDA Programming and Performance	28	50068	July 2, 2012
[solved] Why can't I compute primes past 85 million? (sieve of eratosthenes) CUDA Programming and Performance	2	975	July 17, 2014
Number of items that can be processed in CUDA CUDA Programming and Performance cuda , kernel	5	327	February 17, 2024
Cuda code performance CUDA Programming and Performance	14	3145	December 16, 2014
Memory checker reports unaligned 8-byte access at 0x209820590 CUDA Programming and Performance	3	492	April 5, 2017
The Problem with Primes CUDA Programming and Performance	15	6552	October 10, 2010
Issue with Nested for-loops in kernel CUDA Programming and Performance	13	2333	April 4, 2019
Simple/1st CUDA program: Reverse bits in byte Why is it faster on the CPU? CUDA Programming and Performance	11	7144	December 6, 2007
CUDA Dynamic Parallelism, the code doesn't run as expected CUDA Programming and Performance	18	365	July 20, 2024
Urgent help with threads please! CUDA Programming and Performance	21	10787	March 6, 2008

[code review] my 1st CUDA program - need feedback - Sieve of Eratosthenes

Related topics