[code review] my 1st CUDA program - need feedback - Sieve of Eratosthenes

hey guys, so I’m learning CUDA and wrote a little program which generates prime numbers using the Sieve of Eratosthenes. (I know the limitations of CUDA, especially with memory sizes and limits, but this program is for educational purposes).

So my questions are:

  1. Did I set up the configuration correctly? (did I set dimBlock and dimGrid correctly?)
  2. In my kernel I seem to set a "maxRoot" constant variable for each thread spawned. Is there a way to set that once in the GPU and then have that shared across all threads??
  3. In the given code below, on line 23 - there's optimization #2 - sieve only odds. How can I apply that to my kernel? The way I'm currently doing is spawning threads from 1 to sqrt(max) and assigning each of them (so much like looping, but incrementing i by 1 each time. In that line of code, we see that it starts at 3 and increments by 2 in the for loop.)
  4. Can you guys give me any other feedback on my code? What else am I doing wrong, or what else could be improved?

This is the HOST code for the sieve I attempted to implement in CUDA:

void sieveOfEratosthenes(uint64_t max)
{
   // There are no prime numbers smaller than 2
   if (max >= 2)
   {
      max++;
      // Heap allocation instead of a VLA: a runtime-sized stack array
      // overflows the stack for large max. calloc also zero-initializes,
      // so the memset is no longer needed.
      char *arr = calloc(max, sizeof(char));
      if (arr == NULL)
         return; // allocation failed; nothing we can do

      arr[0] = 1; // a value of 1 means NOT prime; 0 and 1 are not primes
      arr[1] = 1;

      uint64_t maxRoot = (uint64_t)sqrt((double)max); // optimization #1
      uint64_t i;

      // sieve multiples of two
      // j must be 64-bit: the old 'int j' overflows once max > INT_MAX
      uint64_t j;
      for (j = 2 * 2; j < max; j += 2)
      {
         arr[j] = 1;
      }

      for (i = 3; i <= maxRoot; i += 2) // optimization #2 - sieve only odds
      {
         if (arr[i] == 0)
         {
            // start at i*i: smaller multiples were sieved by smaller primes.
            // computed in 64 bits — 'int' i*i overflows once i > ~46340
            for (j = i * i; j < max; j += i)
            {
               arr[j] = 1;
            }
         }
      }

      // display — %llu matches the 64-bit index (the old "%d" with a
      // uint64_t argument was undefined behavior)
      for (i = 0; i < max; i++)
         if (arr[i] == 0)
            printf("%llu ", (unsigned long long)i);
      printf("\n");

      free(arr);
   }
}

This is my actual cuda file: prime.cu

#include <stdio.h>
#include <helper_cuda.h> // checkCudaErrors()
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <cuda_runtime.h>

typedef unsigned long long int uint64_t;

/******************************************************************************
* kernel for finding prime numbers using the sieve of eratosthenes
* - primes: an array of bools. initially all numbers are set to "0".
*           A "0" value means that the number at that index is prime.
* - max: the max size of the primes array
******************************************************************************/
/******************************************************************************
* kernel for finding prime numbers using the sieve of eratosthenes
* - primes: an array of bools. initially all numbers are set to "0".
*           A "0" value means that the number at that index is prime.
* - max: the max size of the primes array
*
* Expects a launch with at least maxRoot+1 threads total (single-block
* design in the current host code).
* NOTE(review): the thread-0 special case makes the first warp divergent;
* splitting init / even-sieve into separate kernels would avoid that.
******************************************************************************/
__global__ static void sieveOfEratosthenesCUDA(char *primes, uint64_t max)
{
   // first thread 0
   if (threadIdx.x == 0 && threadIdx.y == 0)
   {
      primes[0] = 1; // value of 1 means the number is NOT prime
      primes[1] = 1; // numbers "0" and "1" are not prime numbers

      // sieve multiples of two — j in 64 bits ('int' overflows for
      // max > INT_MAX)
      for (uint64_t j = 2 * 2; j < max; j += 2)
      {
         primes[j] = 1;
      }
   }
   else
   {
      // 64-bit index: blockIdx.x * blockDim.x is a 32-bit product otherwise
      uint64_t index = (uint64_t)blockIdx.x * blockDim.x + threadIdx.x;
      const uint64_t maxRoot = (uint64_t)sqrt((double)max);

      // bounds check: sieving must INCLUDE maxRoot itself — with the old
      // '<', maxRoot never sieved, so e.g. max = 122 left 121 = 11*11
      // marked prime. Also skip 0 and 1.
      if (index <= maxRoot && index > 1 && primes[index] == 0)
      {
         // mark off the composite numbers, starting at index*index
         // (computed in 64 bits to avoid 'int' overflow)
         for (uint64_t j = index * index; j < max; j += index)
         {
            primes[j] = 1;
         }
      }
   }
}

/******************************************************************************
* checkDevice()
******************************************************************************/
/******************************************************************************
* checkDevice()
* Queries the default CUDA device and returns a suggested block size
* (16 pre-Fermi, 32 otherwise), or EXIT_FAILURE if the device is unusable.
******************************************************************************/
__host__ int checkDevice()
{
   // query the Device and decide on the block size
   int devID = 0; // the default device ID
   cudaError_t error;
   cudaDeviceProp deviceProp;
   error = cudaGetDevice(&devID);
   if (error != cudaSuccess)
   {
      printf("cudaGetDevice returned error code %d, line(%d)\n", error, __LINE__);
      exit(EXIT_FAILURE);
   }

   error = cudaGetDeviceProperties(&deviceProp, devID);
   // check the error FIRST: if the call failed, deviceProp was never
   // filled in and reading computeMode would be an uninitialized read
   if (error != cudaSuccess || deviceProp.computeMode == cudaComputeModeProhibited)
   {
      printf("CUDA device ComputeMode is prohibited or failed to getDeviceProperties\n");
      return EXIT_FAILURE;
   }

   // Use a larger block size for Fermi and above (see compute capability)
   return (deviceProp.major < 2) ? 16 : 32;
}

/******************************************************************************
* genPrimesOnDevice
* - inputs: limit - the largest prime that should be computed
*           primes - an array of size [limit], initialized to 0
******************************************************************************/
/******************************************************************************
* genPrimesOnDevice
* - inputs: max - one past the largest number to test for primality
*           primes - a host array of size [max], initialized to 0; on return,
*                    primes[i] == 0 iff i is prime
******************************************************************************/
__host__ void genPrimesOnDevice(char* primes, uint64_t max)
{
   // no primes below 2 — nothing to compute
   if (max < 2)
      return;

   int blockSize = checkDevice();
   if (blockSize == EXIT_FAILURE)
      return;

   char* d_Primes = NULL;
   // size_t, not int: an 'int' byte count overflows for max > INT_MAX
   size_t sizePrimes = sizeof(char) * max;
   // explicit double cast avoids the ambiguous sqrt(unsigned long long) overload
   uint64_t maxRoot = (uint64_t)sqrt((double)max);

   // allocate the primes on the device and set them to 0
   checkCudaErrors(cudaMalloc(&d_Primes, sizePrimes));
   checkCudaErrors(cudaMemset(d_Primes, 0, sizePrimes));

   // make sure that there are no errors...
   checkCudaErrors(cudaPeekAtLastError());

   // setup the execution configuration
   // +1 so a thread with index == maxRoot is actually spawned (the kernel
   // must be able to sieve maxRoot itself)
   // NOTE(review): a single block caps maxRoot at the device's
   // maxThreadsPerBlock (typically 1024); larger max needs a real grid
   dim3 dimBlock(maxRoot + 1, 1, 1);
   dim3 dimGrid(1);

   //////// debug
   #ifdef DEBUG
   printf("dimBlock(%u, %u, %u)\n", dimBlock.x, dimBlock.y, dimBlock.z);
   printf("dimGrid(%u, %u, %u)\n", dimGrid.x, dimGrid.y, dimGrid.z);
   #endif

   // call the kernel
   sieveOfEratosthenesCUDA<<<dimGrid, dimBlock>>>(d_Primes, max);

   // check for kernel errors (launch-config errors surface here)
   checkCudaErrors(cudaPeekAtLastError());
   checkCudaErrors(cudaDeviceSynchronize());

   // copy the results back
   checkCudaErrors(cudaMemcpy(primes, d_Primes, sizePrimes, cudaMemcpyDeviceToHost));

   // no memory leaks
   checkCudaErrors(cudaFree(d_Primes));
}

/******************************************************************************
*
******************************************************************************/
/******************************************************************************
* main: find all primes below 102 on the device and print them.
******************************************************************************/
int main()
{
   uint64_t maxPrime = 102; // find all primes from 0-101
   char* primes = (char*) malloc(maxPrime);
   // malloc can fail — don't hand a NULL buffer to the device code
   if (primes == NULL)
   {
      fprintf(stderr, "malloc of %llu bytes failed\n",
              (unsigned long long)maxPrime);
      return EXIT_FAILURE;
   }
   memset(primes, 0, maxPrime); // initialize all elements to 0

   genPrimesOnDevice(primes, maxPrime);

   // display the results — index is 64-bit to match maxPrime, printed
   // with a matching format specifier
   uint64_t i;
   for (i = 0; i < maxPrime; i++)
      if (primes[i] == 0)
         printf("%llu ", (unsigned long long)i);
   printf("\n");

   free(primes);

   return 0;
}

Any kind of feedback, improvements is good. Thanks.

__shared__ uint64_t maxRoot;
if (threadIdx.x == 0 && threadIdx.y == 0) maxRoot = sqrt(max); // optimization #1
__syncthreads();
// maxRoot is now safe to be read from all threads

considering how fast transcendental functions are computed on the GPU, you will barely see
an improvement from the above optimization. Computing this value in the host code and passing it to the kernel as a launch argument is also an adequate optimization.

I noticed an inefficiency in your posted code.

if (threadIdx.x == 0 && threadIdx.y == 0)
{
   ...
    // sieve multiples of two
   ...
}
else
{
   ...
   // mark off the composite numbers
   ...
}

don’t do that. It will make your first warp heavily divergent. Warp divergence is bad.

@cbuchner1, I really appreciate your input! You see, as I am a beginner, I didn’t even know about Warp Divergence.
So, I’ve modified my code and although I didn’t benchmark it, I think it’s a little bit more efficient than before.

I’ve fixed the warp divergence issue you mentioned. These are the modifications to the code:

  • split the kernel I had before into 3 kernels. One for initialization, one for sieving the even numbers, one for the rest.
  • I’m now passing the root to the kernel instead of having the GPU compute it.
  • I’ve adjusted the dimGrid and dimBlocks for the kernels accordingly

Here’s the updated prime.cu

#include <stdio.h>
#include <helper_cuda.h> // checkCudaErrors()
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <cuda_runtime.h>

typedef unsigned long long int uint64_t;

/******************************************************************************
* kernel that initializes the 1st couple of values in the primes array.
******************************************************************************/
/******************************************************************************
* kernel that initializes the 1st couple of values in the primes array.
* Launched with a single thread (<<<1, 1>>> in the host code).
******************************************************************************/
__global__ static void sieveInitCUDA(char* primes)
{
   // a stored 1 flags the index as NOT prime; neither 0 nor 1 is prime
   primes[0] = primes[1] = 1;
}

/******************************************************************************
* kernel for sieving the even numbers starting at 4.
******************************************************************************/
/******************************************************************************
* kernel for sieving the even numbers starting at 4.
* Thread t of block b marks the even number 2*(b*blockDim.x + t) + 4, so the
* grid needs ~(max-4)/2 threads in total.
*
* BUG FIX: the old index, blockIdx.x*blockDim.x + 2*threadIdx.x + 4, made
* consecutive blocks overlap and only reached evens up to roughly max/2 —
* e.g. for max = 102 the number 100 was never marked composite.
******************************************************************************/
__global__ static void sieveEvenNumbersCUDA(char* primes, uint64_t max)
{
   // 64-bit math: blockIdx.x * blockDim.x is a 32-bit product otherwise
   uint64_t tid = (uint64_t)blockIdx.x * blockDim.x + threadIdx.x;
   uint64_t index = tid * 2 + 4;
   if (index < max)
      primes[index] = 1;
}

/******************************************************************************
* kernel for finding prime numbers using the sieve of eratosthenes
* - primes: an array of bools. initially all numbers are set to "0".
*           A "0" value means that the number at that index is prime.
* - max: the max size of the primes array
* - maxRoot: the sqrt of max (the other input). we don't wanna make all threads
*   compute this over and over again, so it's being passed in
******************************************************************************/
/******************************************************************************
* kernel for finding prime numbers using the sieve of eratosthenes
* - primes: an array of bools. initially all numbers are set to "0".
*           A "0" value means that the number at that index is prime.
* - max: the max size of the primes array
* - maxRoot: the sqrt of max (the other input). we don't wanna make all threads
*   compute this over and over again, so it's being passed in
******************************************************************************/
__global__ static void sieveOfEratosthenesCUDA(char *primes, uint64_t max,
                                               const uint64_t maxRoot)
{
   // one thread per candidate starting at 3. 64-bit index: the 32-bit
   // product blockIdx.x * blockDim.x (and index * index below) overflows
   // 'int' for large max
   uint64_t index = (uint64_t)blockIdx.x * blockDim.x + threadIdx.x + 3;

   // sieving must INCLUDE maxRoot itself: with the old '<', maxRoot never
   // sieved, so e.g. max = 122 left 121 = 11*11 marked prime.
   // skipping already-composite indices is a benign race: concurrent
   // threads only ever write 1, so a stale 0 just means redundant work
   if (index <= maxRoot && primes[index] == 0)
   {
      // mark off the composite numbers, starting at index*index
      // (smaller multiples were handled by smaller factors)
      for (uint64_t j = index * index; j < max; j += index)
      {
         primes[j] = 1;
      }
   }
}

/******************************************************************************
* checkDevice()
******************************************************************************/
/******************************************************************************
* checkDevice()
* Queries the default CUDA device and returns a suggested block size
* (16 pre-Fermi, 32 otherwise), or EXIT_FAILURE if the device is unusable.
******************************************************************************/
__host__ int checkDevice()
{
   // query the Device and decide on the block size
   int devID = 0; // the default device ID
   cudaError_t error;
   cudaDeviceProp deviceProp;
   error = cudaGetDevice(&devID);
   if (error != cudaSuccess)
   {
      printf("CUDA Device not ready or not supported\n");
      printf("%s: cudaGetDevice returned error code %d, line(%d)\n", __FILE__, error, __LINE__);
      exit(EXIT_FAILURE);
   }

   error = cudaGetDeviceProperties(&deviceProp, devID);
   // check the error FIRST: if the call failed, deviceProp was never
   // filled in and reading computeMode would be an uninitialized read
   if (error != cudaSuccess || deviceProp.computeMode == cudaComputeModeProhibited)
   {
      printf("CUDA device ComputeMode is prohibited or failed to getDeviceProperties\n");
      return EXIT_FAILURE;
   }

   // Use a larger block size for Fermi and above (see compute capability)
   return (deviceProp.major < 2) ? 16 : 32;
}

/******************************************************************************
* genPrimesOnDevice
* - inputs: limit - the largest prime that should be computed
*           primes - an array of size [limit], initialized to 0
******************************************************************************/
/******************************************************************************
* genPrimesOnDevice
* - inputs: max - one past the largest number to test for primality
*           primes - a host array of size [max], initialized to 0; on return,
*                    primes[i] == 0 iff i is prime
******************************************************************************/
__host__ void genPrimesOnDevice(char* primes, uint64_t max)
{
   // no primes below 2 — nothing to compute
   if (max < 2)
      return;

   int blockSize = checkDevice();
   if (blockSize == EXIT_FAILURE)
      return;

   char* d_Primes = NULL;
   // size_t, not int: an 'int' byte count overflows for max > INT_MAX
   size_t sizePrimes = sizeof(char) * max;
   // explicit double cast avoids the ambiguous sqrt(unsigned long long) overload
   uint64_t maxRoot = (uint64_t)sqrt((double)max);

   // allocate the primes on the device and set them to 0
   checkCudaErrors(cudaMalloc(&d_Primes, sizePrimes));
   checkCudaErrors(cudaMemset(d_Primes, 0, sizePrimes));

   // make sure that there are no errors...
   checkCudaErrors(cudaPeekAtLastError());

   // setup the execution configuration:
   // - dimGrid: ceil-div plus one spare block so every candidate in
   //   [3, maxRoot] gets a thread
   // - dimGridEvens: one thread per even number in [4, max)
   dim3 dimBlock(blockSize);
   dim3 dimGrid((maxRoot + dimBlock.x - 1) / dimBlock.x + 1);
   dim3 dimGridEvens((max / 2 + dimBlock.x - 1) / dimBlock.x);

   //////// debug
   #ifdef DEBUG
   printf("dimBlock(%u, %u, %u)\n", dimBlock.x, dimBlock.y, dimBlock.z);
   printf("dimGrid(%u, %u, %u)\n", dimGrid.x, dimGrid.y, dimGrid.z);
   printf("dimGridEvens(%u, %u, %u)\n", dimGridEvens.x, dimGridEvens.y, dimGridEvens.z);
   #endif

   // call the kernels
   // NOTE: no need to synchronize after each kernel: launches on the same
   // stream execute in issue order
   // http://stackoverflow.com/a/11889641/2261947
   sieveInitCUDA<<<1, 1>>>(d_Primes); // launch a single thread to initialize
   sieveEvenNumbersCUDA<<<dimGridEvens, dimBlock>>>(d_Primes, max);
   sieveOfEratosthenesCUDA<<<dimGrid, dimBlock>>>(d_Primes, max, maxRoot);

   // check for kernel errors (launch-config errors surface here)
   checkCudaErrors(cudaPeekAtLastError());
   checkCudaErrors(cudaDeviceSynchronize());

   // copy the results back
   checkCudaErrors(cudaMemcpy(primes, d_Primes, sizePrimes, cudaMemcpyDeviceToHost));

   // no memory leaks
   checkCudaErrors(cudaFree(d_Primes));
}