FROL
July 17, 2013, 12:15pm
1
Hi! I’m a bit confused about the difference in integration precision between these 2 SDK samples.
Both samples seem to use the same parameter set and 100000 iterations, but the results are quite different:
MC_EstimatePiQ:
Absolute Error = 2.47e-4
Relative Error = 7.87e-5
MC_EstimatePiInlineQ:
Absolute Error = 1.44e-3
Relative Error = 4.6e-4
The difference is huge. Is this expected? Is there a way to get higher precision from the ‘Inline’ sample?
Thanks!
FROL
July 17, 2013, 5:32pm
3
It seems something is wrong with the MC_EstimatePiInlineQ sample itself. I replaced the random number generation in ‘MC_EstimatePiQ’ with the device-side style used by ‘MC_EstimatePiInlineQ’ and still got the old error values:
Absolute Error = 2.47e-4
Relative Error = 7.87e-5
/**
 * Initialise two Sobol32 generator states per thread.
 *
 * Thread `tid` sets up rngStates[tid] from rngDirections[0] and
 * rngStates[tid + step] from rngDirections[1], both with offset `tid`.
 *
 * Launch layout: 1D grid of 1D blocks (only the .x components are used).
 * The grid may over-cover `step`; surplus threads exit without writing.
 *
 * @param rngStates      Device array with room for at least 2 * step states.
 * @param rngDirections  Device array with at least two direction-vector sets.
 * @param step           Number of states per dimension, and the distance
 *                       between a thread's first and second state.
 */
__global__ void initRNG(curandStateSobol32_t* const rngStates, curandDirectionVectors32_t* const rngDirections, unsigned int step)
{
    // Determine thread ID
    const unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;

    // Without this guard a thread with tid >= step would re-initialise a slot
    // in the second half of the array and write past index 2 * step - 1.
    if (tid >= step)
    {
        return;
    }

    // Widen before adding so tid + step cannot wrap in 32-bit arithmetic.
    const size_t second = static_cast<size_t>(tid) + step;

    // Initialise the RNG
    curand_init(rngDirections[0], tid, &rngStates[tid]);
    curand_init(rngDirections[1], tid, &rngStates[second]);
}
/**
 * Draw one 2D sample: one uniform float from each of the two generator states.
 *
 * @param x       Receives the value drawn from state1.
 * @param y       Receives the value drawn from state2.
 * @param state1  Generator state advanced to produce x.
 * @param state2  Generator state advanced to produce y.
 */
__device__ inline void getPoint(float &x, float &y, curandStateSobol32* state1, curandStateSobol32* state2)
{
    // Each coordinate comes from its own state, so the two draws are independent.
    const float first = curand_uniform(state1);
    const float second = curand_uniform(state2);

    x = first;
    y = second;
}
/**
 * Produce one 2D point per thread from previously initialised Sobol32 states.
 *
 * Thread `tid` reads rngStates[tid] and rngStates[tid + step], draws one
 * uniform value from each, and stores them at points[tid] and
 * points[tid + step] respectively.
 *
 * Launch layout: 1D grid of 1D blocks (only the .x components are used).
 * The grid may over-cover `step`; surplus threads exit without touching memory.
 *
 * @param points     Device output array with room for at least 2 * step floats.
 * @param rngStates  Device array of at least 2 * step initialised states.
 * @param step       Number of points, and the offset between the x and y halves
 *                   of both arrays.
 */
__global__ void generateRandsByKernel(float *points, curandStateSobol32* rngStates, unsigned int step)
{
    const unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;

    // Without this guard a thread with tid >= step would read and write
    // past the 2 * step elements the indexing below assumes.
    if (tid >= step)
    {
        return;
    }

    // Widen before adding so tid + step cannot wrap in 32-bit arithmetic.
    const size_t second = static_cast<size_t>(tid) + step;

    // Work on local copies of this thread's two states.
    curandStateSobol32 localState1 = rngStates[tid];
    curandStateSobol32 localState2 = rngStates[second];

    float x, y;
    getPoint(x, y, &localState1, &localState2);

    points[tid] = x;
    points[second] = y;

    // The advanced local states are deliberately not stored back, so a
    // second launch on the same rngStates reproduces the same values.
}
// ......
// Host-side setup: obtain Sobol direction vectors, stage them on the device,
// initialise the per-thread generator states, generate the points, clean up.
// NOTE(review): none of the cuRAND/CUDA return codes below are checked, and
// there is no cudaGetLastError() after the launches — a failure here would go
// unnoticed. Consider wrapping the calls in the project's error-check helper.

// Get a pointer to the 32-bit direction-vector table (Joe-Kuo 6 set).
curandDirectionVectors32_t *rngDirections;
curandGetDirectionVectors32(&rngDirections, CURAND_DIRECTION_VECTORS_32_JOEKUO6);
// Copy the first two direction-vector sets (one per dimension) to the device.
curandDirectionVectors32_t* d_rngDirections;
cudaMalloc((void **)&d_rngDirections, 2 * sizeof(curandDirectionVectors32_t));
cudaMemcpy(d_rngDirections, rngDirections, 2 * sizeof(curandDirectionVectors32_t), cudaMemcpyHostToDevice);
// Two generator states per simulation: entries [0, m_numSims) are used for x,
// entries [m_numSims, 2 * m_numSims) for y.
curandStateSobol32_t* d_states;
cudaMalloc((void **)&d_states, 2 * m_numSims * sizeof(curandStateSobol32_t));
// NOTE(review): grid/block are defined outside this excerpt — confirm that
// grid * block covers m_numSims and how the kernels treat any surplus threads.
initRNG<<<grid, block>>>(d_states, d_rngDirections, m_numSims);
// Both launches use the same (default) stream, so generation runs after init.
generateRandsByKernel<<<grid, block>>>(d_points, d_states, m_numSims);
// Release the temporary device buffers; d_points is left for the caller.
cudaFree(d_rngDirections);
cudaFree(d_states);