Here is a simple program that reproduces the issue:
#include <stdio.h>
#include <cuda.h>
#include <curand_kernel.h>
#include <unistd.h>
// Kernel: each thread initializes its own cuRAND state (seed 0, sequence =
// global thread id, offset 0) and prints four uniform doubles in (0, 1].
// Intended launch: any 1-D grid; device printf is for demonstration only.
__global__ void
func(void){
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    curandState s;
    // Per-thread sequence number (id) gives each thread an independent stream.
    curand_init(0, id, 0, &s);
    for (int i = 0; i < 4; i++) {
        // Note: "%d %f\n" was originally written with typographic quotes,
        // which do not compile; fixed to plain ASCII double quotes.
        printf("%d %f\n", id, curand_uniform_double(&s));
    }
}
// Minimal error-check helper: print the failing call's location and abort.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(err_));                                \
            return 1;                                                         \
        }                                                                     \
    } while (0)

// Measures free device memory before and after launching the kernel, to show
// how much memory a single curand_init-using launch consumes.
int main(void){
    CUDA_CHECK(cudaSetDeviceFlags(cudaDeviceMapHost));
    int const n_thread = 1;
    int const n_block = 1;
    size_t start_mem, end_mem, total_mem;
    CUDA_CHECK(cudaMemGetInfo(&start_mem, &total_mem));
    func<<<n_block, n_thread>>>();
    CUDA_CHECK(cudaGetLastError());  // catch launch-configuration errors
    printf("Version = %d\n", CUDA_VERSION);
    // cudaThreadSynchronize() is deprecated (removed in CUDA 12);
    // cudaDeviceSynchronize() is the supported replacement and also
    // surfaces any asynchronous kernel-execution error.
    CUDA_CHECK(cudaDeviceSynchronize());
    CUDA_CHECK(cudaMemGetInfo(&end_mem, &total_mem));
    // %zu is the correct format specifier for size_t (was %ld).
    printf("used memory = %zu MB\n", (start_mem - end_mem) / 1048576);
    return 0;
}
A single call to curand_init is consuming 298 MB of GPU memory on a Titan X (Pascal) and 267 MB on a Titan X (Maxwell). This is a huge bottleneck in our application, because we run multiple instances of a process that each call curand_init. Why does curand_init allocate this much GPU memory?