Mersenne Twister optimization for Monte Carlo method

For my thesis, I’m currently working on a CUDA implementation to speed up simulations based on the Monte Carlo method.

The problem with the existing MTGP implementation is that it loads the Mersenne Twister parameters from global memory, increasing the global-memory traffic that could otherwise be used for data transfers related to the simulation.

Since most Monte Carlo steps don’t need to communicate back with the host (only when we want to sample the state of the simulation), it is faster to perform multiple steps within one kernel than to invoke a kernel for each step.
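
As a minimal sketch of the difference (step_kernel, multi_step_kernel and n_steps are hypothetical names, not part of the sample below):

// One kernel launch per Monte Carlo step: pays the launch overhead and
// reloads the generator state from global memory on every step.
for (int i = 0; i < n_steps; ++i)
	step_kernel<<<NBLOCKS, BLOCKSIZE>>>(state, data);

// A single launch that performs many steps amortizes both costs.
multi_step_kernel<<<NBLOCKS, BLOCKSIZE>>>(state, data, n_steps);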

My idea was to move the data required by the MTGP into fast shared memory, and after some source-code reading I came up with a tweaked version of the original MTGP implementation by Mutsuo Saito and Makoto Matsumoto (Hiroshima University).

The shared version uses 4248 bytes of shared memory to store the required data (4096 bytes for the state array plus 152 bytes of parameters and bookkeeping). At the start of each kernel it loads the params and state from global memory, and at the end it stores them back for subsequent kernel invocations.

Below is sample source code that verifies it produces the same results as the original:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cuda_runtime_api.h"
#include <cstdio>
#include <cstdlib>
#include "curand_mtgp32.h"
#include "curand_mtgp32_host.h"
#include "curand_mtgp32_kernel.h"
#include <iostream>

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
	if (code != cudaSuccess)
	{
		fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
		if (abort) exit(code);
	}
}

#define NBLOCKS 200
#define BLOCKSIZE 256
#define TEST_RUNS 100

__constant__ unsigned int LOOP_COUNT;

struct curandSharedStateMtgp32 {
	unsigned int s[MTGP32_STATE_SIZE];
	int offset;
	int pIdx;
	int pos;
	unsigned int mask;
	int sh1;			/*< shift value 1. 0 < sh1 < 32. */
	int sh2;			/*< shift value 2. 0 < sh2 < 32. */
	unsigned int tbl[16];		/*< a small matrix. */
	unsigned int tmp_tbl[16];	/*< a small matrix for tempering. */
};
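
// Sanity check for the 4248-byte figure mentioned above: 4096 (state array)
// + 12 (offset/pIdx/pos) + 4 (mask) + 8 (sh1/sh2) + 64 (tbl) + 64 (tmp_tbl)
// = 4248 bytes, all 4-byte aligned.
static_assert(sizeof(curandSharedStateMtgp32) == 4248, "unexpected shared state size");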

// Shared-memory counterpart of cuRAND's internal MTGP32 recursion: same
// logic, but the parameters come from the shared struct instead of global memory.
__forceinline__ __device__ unsigned int para_rec_shared(curandSharedStateMtgp32 *k, unsigned int X1, unsigned int X2, unsigned int Y) {
	unsigned int X = (X1 & k->mask) ^ X2;
	unsigned int MAT;

	X ^= X << k->sh1;
	Y = X ^ (Y >> k->sh2);
	MAT = k->tbl[Y & 0x0f];
	return Y ^ MAT;
}

// Tempering step, again reading its table from shared memory.
__forceinline__ __device__ unsigned int temper_shared(curandSharedStateMtgp32 *k, unsigned int V, unsigned int T) {
	unsigned int MAT;

	T ^= T >> 16;
	T ^= T >> 8;
	MAT = k->tmp_tbl[T & 0x0f];
	return V ^ MAT;
}

// Generates one 32-bit random number per thread from the shared state,
// mirroring curand(curandStateMtgp32_t *).
__forceinline__ __device__ unsigned int shared_curand(curandSharedStateMtgp32 *state)
{
	unsigned int t;
	unsigned int d;
	unsigned int r;
	unsigned int o;

	d = blockDim.z * blockDim.y * blockDim.x;
	t = (blockDim.z * blockDim.y * threadIdx.z) + (blockDim.x * threadIdx.y) + threadIdx.x;
	r = para_rec_shared(state, state->s[(t + state->offset) & MTGP32_STATE_MASK],
		state->s[(t + state->offset + 1) & MTGP32_STATE_MASK],
		state->s[(t + state->offset + state->pos) & MTGP32_STATE_MASK]);

	state->s[(t + state->offset + MTGPDC_N) & MTGP32_STATE_MASK] = r;

	o = temper_shared(state, r, state->s[(t + state->offset + state->pos - 1) & MTGP32_STATE_MASK]);

#if __CUDA_ARCH__ != 0
	__syncthreads();
#endif
	if (t == 0)
	{
		state->offset = (state->offset + d) & MTGP32_STATE_MASK;
	}
#if __CUDA_ARCH__ != 0
	__syncthreads();
#endif
	return o;
}
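// Note: shared_curand() contains __syncthreads(), so every thread of the
// block must call it unconditionally and the same number of times
// (do_steps_shared below satisfies this).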


// Baseline: stock curand(), which fetches the MTGP32 parameters from global memory.
__global__ void do_steps(curandStateMtgp32_t *state, unsigned int *data)
{
	int thread_data = 0;
	for (int i = 0; i < LOOP_COUNT; ++i) {
		thread_data += curand(&state[blockIdx.x]);
	}
	data[blockIdx.x * blockDim.x + threadIdx.x] += thread_data;
}

// Shared-memory version: stage state and parameters in shared memory,
// generate LOOP_COUNT numbers per thread, then write the state back.
__global__ void do_steps_shared(curandStateMtgp32_t *state, unsigned int *data)
{
	__shared__ curandSharedStateMtgp32 state_;
	state += blockIdx.x;
	// MTGP32_STATE_SIZE = 1024 elements (4096 bytes); with 256 threads each
	// thread copies 4 elements (omitting the range check)
#pragma unroll
	for (unsigned short p = 0; p < MTGP32_STATE_SIZE; p += blockDim.x) {
		state_.s[threadIdx.x + p] = state->s[threadIdx.x + p];
	}

	//Copy the MTGP bookkeeping data to shared memory
	if (threadIdx.x == 0) {
		state_.pIdx = state->pIdx;
		state_.offset = state->offset;
		state_.pos = state->k->pos_tbl[state->pIdx];
		state_.sh1 = state->k->sh1_tbl[state->pIdx];
		state_.sh2 = state->k->sh2_tbl[state->pIdx];
		state_.mask = state->k->mask[0];
	}

	//Copy the param and temper tables to shared memory (TBL_SIZE = 16).
	//pIdx is read from global memory here: state_.pIdx is written by thread 0
	//above and is not guaranteed to be visible to the other threads before the
	//__syncthreads() below.
	if (threadIdx.x < TBL_SIZE) {
		state_.tbl[threadIdx.x] = state->k->param_tbl[state->pIdx][threadIdx.x];
		state_.tmp_tbl[threadIdx.x] = state->k->temper_tbl[state->pIdx][threadIdx.x];
	}
	__syncthreads();
	int thread_data = 0;
	for (int i = 0; i < LOOP_COUNT; ++i) {
		thread_data += shared_curand(&state_);
	}
	data[blockIdx.x * blockDim.x + threadIdx.x] += thread_data;

	//Copy the state and offset back to global memory for subsequent kernel
	//invocations (shared_curand() ends with __syncthreads(), so the shared
	//state array is consistent at this point)
	for (unsigned short p = 0; p < MTGP32_STATE_SIZE; p += blockDim.x) {
		state->s[threadIdx.x + p] = state_.s[threadIdx.x + p];
	}
	if (threadIdx.x == 0) {
		state->offset = state_.offset;
	}
}

void init_mtgp(curandStateMtgp32_t * state, mtgp32_kernel_params_t * params, unsigned int seed = 123)
{
	//Copy the CUDA kernel params from the pre-generated fast parameter set 11213
	//(mtgp32dc_params_fast_11213 provides 200 parameter sets, one per block here)
	curandMakeMTGP32Constants(mtgp32dc_params_fast_11213, params);

	//Initialize the kernel states
	curandMakeMTGP32KernelState(state, mtgp32dc_params_fast_11213, params, NBLOCKS, seed);
}

int main()
{
	unsigned int * ddata1, *ddata2;
	unsigned int * hdata1, *hdata2;
	unsigned int loop_count = 1000;
	const unsigned int size = NBLOCKS * BLOCKSIZE * sizeof(unsigned int);
	curandStateMtgp32_t * d_random_state_;
	mtgp32_kernel_params_t * d_random_params_;


	//allocate data for 200 blocks each with 256 threads for each method tested
	gpuErrchk(cudaMalloc(&ddata1, size));
	gpuErrchk(cudaMemset(ddata1, 0, size));
	gpuErrchk(cudaMalloc(&ddata2, size));
	gpuErrchk(cudaMemset(ddata2, 0, size));
	gpuErrchk(cudaMallocHost(&hdata1, size));
	gpuErrchk(cudaMallocHost(&hdata2, size));

	//Malloc memory for curand kernel params
	gpuErrchk(cudaMalloc(reinterpret_cast<void**>(&d_random_params_), sizeof(mtgp32_kernel_params)));

	//Malloc memory for curand kernel states
	gpuErrchk(cudaMalloc(reinterpret_cast<void**>(&d_random_state_), NBLOCKS * sizeof(curandStateMtgp32)));
	
	//Transfer the loop bound to the constant symbol
	gpuErrchk(cudaMemcpyToSymbol(LOOP_COUNT, &loop_count, sizeof(loop_count)));

	init_mtgp(d_random_state_, d_random_params_, 123);
	for (int i = 0; i < TEST_RUNS; ++i)
		do_steps<<<NBLOCKS, BLOCKSIZE>>>(d_random_state_, ddata1);

	//Re-initialize with the same seed so both kernels start from identical states
	init_mtgp(d_random_state_, d_random_params_, 123);
	for (int i = 0; i < TEST_RUNS; ++i)
		do_steps_shared<<<NBLOCKS, BLOCKSIZE>>>(d_random_state_, ddata2);

	gpuErrchk(cudaMemcpy(hdata1, ddata1, size, cudaMemcpyDeviceToHost));
	gpuErrchk(cudaMemcpy(hdata2, ddata2, size, cudaMemcpyDeviceToHost));

	unsigned int comp = 0;
	for (int i = 0; i < NBLOCKS * BLOCKSIZE; ++i) {
		comp |= hdata1[i] ^ hdata2[i];	// any differing bit makes comp non-zero
	}
	if (comp == 0) {
		std::cout << "Both kernels returned same random numbers" << std::endl;
	} else {
		std::cerr << "My hacked code is wrong" << std::endl;
	}
	

	cudaFree(ddata1);
	cudaFree(ddata2);
	cudaFreeHost(hdata1);
	cudaFreeHost(hdata2);
	cudaFree(d_random_state_);
	cudaFree(d_random_params_);
}

The performance speed-up I got on my GTX 960 is as follows:

==618764== Profiling application: monte_carlo.exe
==618764== Profiling result:
==618764== Metric result:
Invocations                               Metric Name                        Metric Description         Min         Max         Avg
Device "GeForce GTX 960 (0)"
    Kernel: do_steps(curandStateMtgp32*, unsigned int*)
        100                  gld_requested_throughput          Requested Global Load Throughput  106.70GB/s  110.84GB/s  108.55GB/s
        100                    shared_load_throughput             Shared Memory Load Throughput  0.00000B/s  0.00000B/s  0.00000B/s
    Kernel: do_steps_shared(curandStateMtgp32*, unsigned int*)
        100                  gld_requested_throughput          Requested Global Load Throughput  382.80MB/s  388.48MB/s  385.28MB/s
        100                    shared_load_throughput             Shared Memory Load Throughput  806.88GB/s  818.86GB/s  814.08GB/s

==620428== Profiling application: monte_carlo.exe
==620428== Profiling result:
Time(%)      Time     Calls       Avg       Min       Max  Name
 78.23%  937.09ms       100  9.3709ms  8.8739ms  10.748ms  do_steps(curandStateMtgp32*, unsigned int*)
 21.76%  260.64ms       100  2.6064ms  2.5860ms  2.6519ms  do_steps_shared(curandStateMtgp32*, unsigned int*)
  0.01%  158.05us        17  9.2970us     576ns  72.514us  [CUDA memcpy HtoD]
  0.00%  35.681us         2  17.840us  17.313us  18.368us  [CUDA memcpy DtoH]
  0.00%  11.808us         2  5.9040us  5.1520us  6.6560us  [CUDA memset]

In short, the shared version cuts the requested global load throughput from ~108 GB/s to ~385 MB/s and runs about 3.6x faster on average (2.61 ms vs 9.37 ms per kernel). I’m posting this here to help anybody who is doing Monte Carlo simulations or wants to call curand() multiple times within a kernel.

If you have any feedback or suggestions, I’d be more than happy to read them :)

PS: I removed the license headers to keep the sample code shorter and more readable; my intention is not to violate any copyrights of NVIDIA, Mutsuo Saito, Makoto Matsumoto, or Hiroshima University.
