Number of items that can be processed in CUDA

Hi,
I am trying out CUDA programming and so far it has been a fun experience. One of my favorite things to do is to calculate how many prime numbers there are in a given range.
I have done this with multithreading and now I am learning how to do it with CUDA.
But I have an issue with the program I wrote using CUDA. It works up to a range of 10,000,000, but once I hit 16,800,000 the number of primes it finds stays fixed at 1,077,871 no matter how much higher I set the range.

I went even up to 300,000,000, which I know has 16,252,325 prime numbers from my multithreaded program.

I have tested all the functions independently and they seem to work without issue, so I think the execution configuration may be the problem, since I am using the "per thread per element" paradigm instead of the "grid-stride loops" approach - I want to try it with the former first.

This is the execution configuration I am using at the moment:

gridSize = 390625
blockSize = 768

CUDA_CHECK_RETURN(cudaMalloc((void**)&d_PrimeCount, gridSize * sizeof(int)));
CUDA_CHECK_RETURN(cudaMalloc((void**)&d_PrimeStorage, N * sizeof(long long)));
kernel << <gridSize, blockSize >> > (N, d_PrimeCount, d_PrimeStorage);

which should cover the needed 300 million elements (390,625 × 768 = 300,000,000). I am passing cudaMalloc a size of 300 million * sizeof(long long) and storing the values in one big array.
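In terms of memory, if my arithmetic is right that is 300,000,000 * sizeof(long long) = 2.4 GB for d_PrimeStorage plus 390,625 * sizeof(int), about 1.5 MB, for the per-block counters, which should fit comfortably in the 8 GB on the card.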

I populate d_PrimeStorage before passing it to the kernel:


StartValue=1
N=300000000
		PopulateStorage << <gridSize, blockSize >> > (d_PrimeStorage, StartValue, N);

__global__ void PopulateStorage( long long* Storage,  long long StartValue,  long long NumberOfItems)
{
	int idx = blockIdx.x * blockDim.x + threadIdx.x;

	if (idx < NumberOfItems)
	{
		Storage[idx] = StartValue + idx;
	}	
	__syncthreads();
}

__global__ void kernel(int NumberOfItems,  int* PrimeCountFound,  long long* PrimeStorage) 
{
	__shared__ unsigned int count;

	int idx = blockIdx.x * blockDim.x + threadIdx.x;
	int localID = threadIdx.x;
	bool IsValuePrime = false;

	if (localID == 0)
		count = 0;
	__syncthreads();

	if (idx < NumberOfItems)
	{
		IsValuePrime = isPrime(PrimeStorage[idx]);
	}

	if (IsValuePrime)
	{
		// warp-aggregated count: one atomicAdd per coalesced group of threads that found a prime
		coalesced_group active = coalesced_threads();
		if (active.thread_rank() == 0)
			atomicAdd(&count, active.size());
	}
	else
	{
		PrimeStorage[idx] = 0;
	}

	__syncthreads();

	if (localID == 0)
		PrimeCountFound[blockIdx.x] = count;
}

__device__ bool isPrime(long long number)
{
	if (number < 2)
		return false;

	if (number == 2 || number == 3)
		return true;

	//This was missing initially. 
	if (number % 2 == 0 || number % 3 == 0)
		return false;

	if (std::fmod((float)number, (float)2) == 0.0)
		return false;

	long long sqrtNumber = (long long)sqrt((float)number);
	long long dx = 4;

	for (long long i = 5; i <= sqrtNumber; i += dx) {
		if (std::fmod((float)number, (float)i) == 0.0)
			return false;
		dx = -(dx - 6);
	}

	return true;
}

The specs for my GPU are:


 CUDA Device Query (Runtime API) version (CUDART static linking)

Detected 1 CUDA Capable device(s)

Device 0: "NVIDIA GeForce RTX 3070 Ti Laptop GPU"
  CUDA Driver Version / Runtime Version          12.3 / 12.3
  CUDA Capability Major/Minor version number:    8.6
  Total amount of global memory:                 8192 MBytes (8589410304 bytes)
  (046) Multiprocessors, (128) CUDA Cores/MP:    5888 CUDA Cores
  GPU Max Clock rate:                            1485 MHz (1.49 GHz)
  Memory Clock rate:                             7001 Mhz
  Memory Bus Width:                              256-bit
  L2 Cache Size:                                 4194304 bytes
  Maximum Texture Dimension Size (x,y,z)         1D=(131072), 2D=(131072, 65536), 3D=(16384, 16384, 16384)
  Maximum Layered 1D Texture Size, (num) layers  1D=(32768), 2048 layers
  Maximum Layered 2D Texture Size, (num) layers  2D=(32768, 32768), 2048 layers
  Total amount of constant memory:               65536 bytes
  Total amount of shared memory per block:       49152 bytes
  Total shared memory per multiprocessor:        102400 bytes
  Total number of registers available per block: 65536
  Warp size:                                     32
  Maximum number of threads per multiprocessor:  1536
  Maximum number of threads per block:           1024
  Max dimension size of a thread block (x,y,z): (1024, 1024, 64)
  Max dimension size of a grid size    (x,y,z): (2147483647, 65535, 65535)
  Maximum memory pitch:                          2147483647 bytes
  Texture alignment:                             512 bytes
  Concurrent copy and kernel execution:          Yes with 1 copy engine(s)
  Run time limit on kernels:                     Yes
  Integrated GPU sharing Host Memory:            No
  Support host page-locked memory mapping:       Yes
  Alignment requirement for Surfaces:            Yes
  Device has ECC support:                        Disabled
  CUDA Device Driver Mode (TCC or WDDM):         WDDM (Windows Display Driver Model)
  Device supports Unified Addressing (UVA):      Yes
  Device supports Managed Memory:                Yes
  Device supports Compute Preemption:            Yes
  Supports Cooperative Kernel Launch:            Yes
  Supports MultiDevice Co-op Kernel Launch:      No
  Device PCI Domain ID / Bus ID / location ID:   0 / 1 / 0
  Compute Mode:
     < Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) >

deviceQuery, CUDA Driver = CUDART, CUDA Driver Version = 12.3, CUDA Runtime Version = 12.3, NumDevs = 1
Result = PASS

D:\Projects\cuda-samples\bin\win64\Debug>bandwidthTest.exe
[CUDA Bandwidth Test] - Starting...
Running on...

 Device 0: NVIDIA GeForce RTX 3070 Ti Laptop GPU
 Quick Mode

 Host to Device Bandwidth, 1 Device(s)
 PINNED Memory Transfers
   Transfer Size (Bytes)        Bandwidth(GB/s)
   32000000                     13.4

 Device to Host Bandwidth, 1 Device(s)
 PINNED Memory Transfers
   Transfer Size (Bytes)        Bandwidth(GB/s)
   32000000                     13.2

 Device to Device Bandwidth, 1 Device(s)
 PINNED Memory Transfers
   Transfer Size (Bytes)        Bandwidth(GB/s)
   32000000                     390.9

Result = PASS

NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.

I suspect the issue is that I have reached some sort of limit and that I should use the grid-stride method instead when dealing with such a large dataset.
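
For reference, this is roughly what I understand a grid-stride version of my PopulateStorage would look like (just an untested sketch, the kernel name is only for illustration):

__global__ void PopulateStorageGridStride(long long* Storage, long long StartValue, long long NumberOfItems)
{
	// each thread starts at its global index and then strides by the total number
	// of threads in the grid, so the launch does not have to match NumberOfItems exactly
	long long stride = (long long)gridDim.x * blockDim.x;
	for (long long i = (long long)blockIdx.x * blockDim.x + threadIdx.x; i < NumberOfItems; i += stride)
	{
		Storage[i] = StartValue + i;
	}
}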

Thank you in advance.

Please post a complete minimal example that can be compiled and executed.

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <iostream>
#include <sstream>   // for std::ostringstream used in CUDA_CHECK_RETURN
#include <stdexcept> // for std::runtime_error
#include <cuda.h>
#include <memory>
#include <assert.h>
#include "PrimeCheckerLite.h"

// This is for Visual Studio IntelliSense to work
#ifndef __CUDACC__
#define __CUDACC__
#endif // !__CUDACC__

//This is where you can then place the header files for specific CUDA headers
#include <cooperative_groups.h>
#include <cooperative_groups/reduce.h>

using namespace std;
using namespace cooperative_groups;

#define CUDA_CHECK_RETURN(value) {           \
    cudaError_t _m_cudaStat = value;         \
	std::ostringstream errorMessage;		 \
    if (_m_cudaStat != cudaSuccess) {        \
		 errorMessage <<"Error ["<<cudaGetErrorString(_m_cudaStat)<<"]" \
					  <<" line["<<__LINE__<<"]"<< " file[" << __FILE__ << "]";\
		 throw runtime_error(errorMessage.str()); \
    } \
}



__device__ bool isPrime(long long number)
{
	if (number < 2)
		return false;

	if (number == 2 || number == 3)
		return true;

	//This was missing initially. 
	if (number % 2 == 0 || number % 3 == 0)
		return false;

	if (std::fmod((float)number, (float)2) == 0.0)
		return false;

	long long sqrtNumber = (long long)sqrt((float)number);
	long long dx = 4;

	for (long long i = 5; i <= sqrtNumber; i += dx) {
		if (std::fmod((float)number, (float)i) == 0.0)
			return false;
		dx = -(dx - 6);
	}

	return true;
}

__global__ void kernel(int NumberOfItems,  int* PrimeCountFound,  long long* PrimeStorage) 
{
	__shared__ unsigned int count;

	int idx = blockIdx.x * blockDim.x + threadIdx.x;
	int localID = threadIdx.x;
	bool IsValuePrime = false;

	if (localID == 0)
		count = 0;
	__syncthreads();

	if (idx < NumberOfItems)
	{
		IsValuePrime = isPrime(PrimeStorage[idx]);
	}

	if (IsValuePrime)
	{
		// warp-aggregated count: one atomicAdd per coalesced group of threads that found a prime
		coalesced_group active = coalesced_threads();
		if (active.thread_rank() == 0)
			atomicAdd(&count, active.size());
	}
	else
	{
		PrimeStorage[idx] = 0;
	}

	__syncthreads();

	if (localID == 0)
		PrimeCountFound[blockIdx.x] = count;
}


__global__ void PopulateStorage( long long* Storage,  long long StartValue,  long long NumberOfItems)
{
	int idx = blockIdx.x * blockDim.x + threadIdx.x;

	if (idx < NumberOfItems)
	{
		Storage[idx] = StartValue + idx;
	}	
	__syncthreads();
}

int main()
{
	unsigned long long StartValue= 1;
	int N = 300000000;
	int blockSize=0, gridSize=0;
	int* d_PrimeCount = nullptr;
	long long* d_PrimeStorage = nullptr;
	unique_ptr<int[]> h_PrimeCount;
	unique_ptr<long long[]> h_PrimeStorage;

	cudaOccupancyMaxPotentialBlockSize(&gridSize, &blockSize, (void*)kernel, sizeof(int), 0);
	gridSize = ceil(1.0 * N / blockSize); // the occupancy API only returns a minimum grid size, so recompute gridSize to cover all N elements
	try
	{

		// allocate as many counters as blocks
		h_PrimeCount = make_unique< int[]>(gridSize);
		CUDA_CHECK_RETURN(cudaMalloc((void**)&d_PrimeCount, gridSize * sizeof(int)));
		CUDA_CHECK_RETURN(cudaMemset(d_PrimeCount,0, gridSize * sizeof(int)));

		//allocate the storage to hold all the number to be checked for Primeality 
		h_PrimeStorage = make_unique<long long[]>(N);
		CUDA_CHECK_RETURN(cudaMalloc((void**)&d_PrimeStorage, N * sizeof(long long)));
		CUDA_CHECK_RETURN(cudaMemset(d_PrimeStorage, 0, N * sizeof(long long)));

		PopulateStorage << <gridSize, blockSize >> > (d_PrimeStorage, StartValue, N);
		CUDA_CHECK_RETURN(cudaDeviceSynchronize()); //Wait to synchronise all the kernel before stopping the measuring

		kernel << <gridSize, blockSize >> > (N, d_PrimeCount, d_PrimeStorage); //call the kernel
		CUDA_CHECK_RETURN(cudaDeviceSynchronize()); //Wait to synchronise all the kernel before stopping the measuring

		//Now copy out the data
		CUDA_CHECK_RETURN(cudaMemcpy(h_PrimeCount.get(), d_PrimeCount, sizeof(int) * gridSize, cudaMemcpyDeviceToHost));
		CUDA_CHECK_RETURN(cudaMemcpy(h_PrimeStorage.get(), d_PrimeStorage, sizeof(long long) * N, cudaMemcpyDeviceToHost));


		// Calculate how many prime numbers it found
		int TotalPrime = 0;
		for (int i = 0; i < N; i++)
		{
			if (h_PrimeStorage[i] > 0)
			{
				TotalPrime++;
			}
		}
		std::cout << TotalPrime << " prime numbers were found." << std::endl;
	}
	//Any exception
	catch (const runtime_error& e)
	{
		cerr << "Exception " << e.what() << endl;
	}

	//cleanup before exiting.
	if (d_PrimeCount)	
		CUDA_CHECK_RETURN(cudaFree(d_PrimeCount));
	if (d_PrimeStorage)	
		CUDA_CHECK_RETURN(cudaFree(d_PrimeStorage));
	CUDA_CHECK_RETURN(cudaDeviceReset());

	return EXIT_SUCCESS;
}

This is the smallest I can make it. The kernel marks out non-prime numbers by setting their values to zero.

You are casting long long to float. Float does not have the same integer range as long long and becomes inexact above roughly 16,800,000 (floats represent integers exactly only up to 2^24 = 16,777,216).
If you use double instead of float, your code will print "16252325 prime numbers were found."
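
For what it is worth, since the inputs are integers anyway, you could also drop floating point from isPrime entirely and rely on the integer % operator, which is exact over the whole long long range. A rough sketch of the trial-division loop (the name isPrimeInteger is just to distinguish it from your version; it uses the same 6k±1 idea as your dx trick, written with a fixed step of 6):

__device__ bool isPrimeInteger(long long number)
{
	if (number < 2)
		return false;
	if (number == 2 || number == 3)
		return true;
	if (number % 2 == 0 || number % 3 == 0)
		return false;

	// test candidates of the form 6k-1 and 6k+1 up to sqrt(number);
	// integer modulo has no rounding, regardless of how large number gets
	for (long long i = 5; i * i <= number; i += 6)
	{
		if (number % i == 0 || number % (i + 2) == 0)
			return false;
	}
	return true;
}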


Thank you! I previously had an issue using std::fmod and had cast to float without realizing the implications!

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.