Hello,
I compute prime numbers with CUDA:
// Flags composite odd numbers inside a window of odd candidates.
//
// Entry `index` of `tab` stands for the odd number N = 2 * (Nstart + index) + 1.
// The kernel only ever writes `false`: it does so when N has an odd divisor c
// with c >= 3 and c * c <= N. Entries for which no divisor is found are left
// untouched, so `tab` must already hold the desired default on entry.
//
// Parameters:
//   tab    - flag array with at least `Ndelta` elements, writable from the device
//   Nstart - offset added to `index` before mapping it to an odd number
//   Ndelta - number of entries of `tab` to process (valid indices: 0 .. Ndelta-1)
//
// Launch layout: 1D grid of 1D blocks. Each thread starts at its global index
// and advances by the total number of launched threads, so any grid/block size
// covers the whole array.
__global__ void PrimeNumberV2( bool *tab, unsigned long long Nstart, unsigned long long Ndelta)
{
    // Widen to 64 bits BEFORE multiplying: the built-in index variables are
    // 32-bit, and their product would otherwise wrap before being stored.
    const unsigned long long stride = (unsigned long long)blockDim.x * gridDim.x;

    // Strict '<' keeps the last write inside an Ndelta-element array.
    for (unsigned long long index = (unsigned long long)blockIdx.x * blockDim.x + threadIdx.x;
         index < Ndelta;
         index += stride)
    {
        const unsigned long long N = (index + Nstart) * 2 + 1;

        // A composite N always has a divisor no larger than sqrt(N), so the
        // search can stop once c * c > N. Written as c <= N / c so the bound
        // itself cannot overflow.
        for (unsigned long long c = 3; c <= N / c; c += 2)
        {
            if (N % c == 0)
            {
                tab[index] = false;
                break;
            }
        }
    }
}
I use Unified Memory:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <chrono> //Keep track of time
#include <cstdlib>
#include <fstream>
#include <iostream>
using namespace std::chrono;
// File-scope output stream; presumably used to log benchmark results (its
// open/write calls are not visible in this excerpt).
std::ofstream fileOut_Benchmark;
// Pointer to the flag array. The pointer variable itself is declared
// __device__ __managed__, i.e. it is a managed variable visible to both host
// and device code.
// NOTE(review): the host code fills this pointer via cudaMallocManaged and
// passes it to the kernel as an argument, so a plain host-side `bool *` would
// presumably be sufficient - confirm no device code reads `Tab` directly.
__device__ __managed__ bool *Tab;
Host (CPU) code:
// Number of flags to allocate: half the width of the [start, end) range.
unsigned long long Udiff = (end - start)/2;
//bool *Tab = NULL;
// Allocate the flag array in unified memory. The request can fail (for
// example when it is larger than the platform is able to provide), so report
// the error and stop instead of continuing with an invalid pointer.
const cudaError_t tabAllocErr = cudaMallocManaged(&Tab, Udiff * sizeof(bool));
if (tabAllocErr != cudaSuccess)
{
    fprintf(stderr, "cudaMallocManaged failed for %llu elements: %s\n",
            Udiff, cudaGetErrorString(tabAllocErr));
    std::exit(EXIT_FAILURE);
}
cudaDeviceSynchronize();
initTab(Tab, Udiff);
cudaDeviceSynchronize();
int NbThreadPerBlock = 1024;
int NbBlockPerGrid = 512;
PrimeNumberV2<<<NbBlockPerGrid, NbThreadPerBlock>>>(Tab, startN, Udiff);
// A kernel launch returns nothing; an invalid launch configuration is only
// visible through cudaGetLastError().
const cudaError_t tabLaunchErr = cudaGetLastError();
if (tabLaunchErr != cudaSuccess)
{
    fprintf(stderr, "PrimeNumberV2 launch failed: %s\n", cudaGetErrorString(tabLaunchErr));
    cudaFree(Tab);
    std::exit(EXIT_FAILURE);
}
// Wait for the kernel before the host reads Tab; faults raised while the
// kernel was running are reported by this call.
const cudaError_t tabSyncErr = cudaDeviceSynchronize();
if (tabSyncErr != cudaSuccess)
{
    fprintf(stderr, "PrimeNumberV2 execution failed: %s\n", cudaGetErrorString(tabSyncErr));
    cudaFree(Tab);
    std::exit(EXIT_FAILURE);
}
count = countTab(Tab, Udiff, startN, mode);
cudaFree(Tab);
Problem: the size of Tab is limited to 600,000 elements, and I want Tab to hold 20,000,000,000 elements.
I would like to use my PC's RAM (32 GB) to store Tab, but how do I do that?