Hi,
I am trying to improve the performance of my application by caching the curand setup (as this is the most time consuming part of Monte Carlo for my purposes).
My curand setup routine is:
__global__ void setup_kernel(CURAND_GENERATOR_TYPE * state, Model** model, unsigned long seed)
{
const unsigned tid = threadIdx.x;
const unsigned bid = blockIdx.x;
const unsigned bsz = blockDim.x;
int index = tid + bid * bsz;
long offset = (*(*model)->offset)(index);
curand_init(seed, index, offset, &state[index]);
}
My idea is to simply cache the CURAND_GENERATOR_TYPE* data as follow:
std::string curandFileName = std::string("C:\\temp\\curand_") + std::to_string(BLOCK_SIZE_CURAND * NUM_THREADS_PER_BLOCK) + std::string(".dat");
if (FILE * file = fopen(curandFileName.c_str(), "r"))
{
CURAND_GENERATOR_TYPE* hostState = (CURAND_GENERATOR_TYPE*) malloc(BLOCK_SIZE_CURAND * NUM_THREADS_PER_BLOCK * sizeof(CURAND_GENERATOR_TYPE));
fread(hostState, sizeof(CURAND_GENERATOR_TYPE), BLOCK_SIZE_CURAND * NUM_THREADS_PER_BLOCK, file);
cudaError_t errr = cudaMemcpy(devState, hostState, BLOCK_SIZE_CURAND * NUM_THREADS_PER_BLOCK * sizeof(CURAND_GENERATOR_TYPE), cudaMemcpyHostToDevice);
fclose(file);
}
else
{
setup_kernel <<< BLOCK_SIZE_CURAND, NUM_THREADS_PER_BLOCK >>> (devState, dModel, seed);
CURAND_GENERATOR_TYPE* hostState = (CURAND_GENERATOR_TYPE*)malloc(BLOCK_SIZE_CURAND * NUM_THREADS_PER_BLOCK * sizeof(CURAND_GENERATOR_TYPE));
cudaError_t errr = cudaMemcpy(hostState, devState, BLOCK_SIZE_CURAND * NUM_THREADS_PER_BLOCK * sizeof(CURAND_GENERATOR_TYPE), cudaMemcpyDeviceToHost);
file = fopen(curandFileName.c_str(), "w");
fwrite(hostState, sizeof(CURAND_GENERATOR_TYPE), BLOCK_SIZE_CURAND * NUM_THREADS_PER_BLOCK, file);
fclose(file);
}
At first sight this seemed to be working well. However, on closer inspection I realised the Monte Carlo results are not quite the same - I get some convergence error on the cached version (about 3% from the correct result when I don’t cache). I do not get any errors from cuda which makes me wonder if I have made a counting error on the size to save to disk or something. Any ideas? It’s quite hard to extract a minimal working example from my application, but I can do this if it is not obvious what I have done wrong.
Many thanks in advance.