I’m currently doing for my thesis a CUDA implementation to speed up simulation that are based on the Monte Carlo Method.
The problem with the existing mtgp implementation is that it loads the Mersenne Twister parameters from global memory and thus increasing workload for the global memory that it could be used for data transfers related to the simulation.
Since most of the monte carlo steps do need to communicate back with the host (only when we need to sample the state of the simulation). It is faster to perform multiple steps within a kernel than invoking a kernel for each step.
My idea was to transfer to the fast shared memory the data required for the mtgp and after some source code reading I come up with a tweaked version of the original mtgp implementation by Mutsuo Saito, Makoto Matsumoto and Hiroshima.
The shared version uses 4248 bytes from shared memory to store the data required. At the start of each kernel, it loads the params and state from global memory and at the end stores them back to global memory for consequent kernel invocations.
Below is a sample source code to verify that it produces the same results :
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cuda_runtime_api.h"
#include <cstdio>
#include <cstdlib>
#include "curand_mtgp32.h"
#include "curand_mtgp32_host.h"
#include "curand_mtgp32_kernel.h"
#include <iostream>
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
if (code != cudaSuccess)
{
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
#define NBLOCKS 200
#define BLOCKSIZE 256
#define TEST_RUNS 100
__constant__ unsigned int LOOP_COUNT;
struct curandSharedStateMtgp32 {
unsigned int s[MTGP32_STATE_SIZE];
int offset;
int pIdx;
int pos;
unsigned int mask;
int sh1; /*< shift value 1. 0 < sh1 < 32. */
int sh2; /*< shift value 2. 0 < sh2 < 32. */
unsigned int tbl[16]; /*< a small matrix. */
unsigned int tmp_tbl[16]; /*< a small matrix for tempering. */
};
__forceinline__ __device__ unsigned int para_rec_shared(curandSharedStateMtgp32 * k, unsigned int X1, unsigned int X2, unsigned int Y) {
unsigned int X = (X1 & k->mask) ^ X2;
unsigned int MAT;
X ^= X << k->sh1;
Y = X ^ (Y >> k->sh2);
MAT = k->tbl[Y & 0x0f];
return Y ^ MAT;
}
__forceinline__ __device__ unsigned int temper_shared(curandSharedStateMtgp32 * k, unsigned int V, unsigned int T) {
unsigned int MAT;
T ^= T >> 16;
T ^= T >> 8;
MAT = k->tmp_tbl[T & 0x0f];
return V ^ MAT;
}
__forceinline__ __device__ unsigned int shared_curand(curandSharedStateMtgp32 *state)
{
unsigned int t;
unsigned int d;
unsigned int r;
unsigned int o;
d = blockDim.z * blockDim.y * blockDim.x;
t = (blockDim.z * blockDim.y * threadIdx.z) + (blockDim.x * threadIdx.y) + threadIdx.x;
r = para_rec_shared(state, state->s[(t + state->offset) & MTGP32_STATE_MASK],
state->s[(t + state->offset + 1) & MTGP32_STATE_MASK],
state->s[(t + state->offset + state->pos) & MTGP32_STATE_MASK]);
state->s[(t + state->offset + MTGPDC_N) & MTGP32_STATE_MASK] = r;
o = temper_shared(state, r, state->s[(t + state->offset + state->pos - 1) & MTGP32_STATE_MASK]);
#if __CUDA_ARCH__ != 0
__syncthreads();
#endif
if (t == 0)
{
state->offset = (state->offset + d) & MTGP32_STATE_MASK;
}
#if __CUDA_ARCH__ != 0
__syncthreads();
#endif
return o;
}
__global__ void do_steps(curandStateMtgp32_t *state, unsigned int *data )
{
int thread_data = 0;
for (int i = 0; i < LOOP_COUNT; ++i) {
thread_data += curand(&state[blockIdx.x]);
}
data[blockIdx.x * blockDim.x + threadIdx.x] += thread_data;
}
__global__ void do_steps_shared(curandStateMtgp32_t *state, unsigned int *data)
{
__shared__ curandSharedStateMtgp32 state_;
state += blockIdx.x;
// MTGP32_STATE_SIZE = 1024 bytes and we have 256 threads (ommiting range check)
#pragma unroll
for (unsigned short p = 0; p < MTGP32_STATE_SIZE; p += blockDim.x) {
state_.s[threadIdx.x + p] = state->s[threadIdx.x + p];
}
//copy to shared memory data used by the mtgp
if (threadIdx.x == 0) {
state_.pIdx = state->pIdx;
state_.offset = state->offset;
state_.pos = state->k->pos_tbl[state->pIdx];
state_.sh1 = state->k->sh1_tbl[state->pIdx];
state_.sh2 = state->k->sh2_tbl[state->pIdx];
state_.mask = state->k->mask[0];
}
//Copy to shared memory temper tables and param tables TBL_SIZE = 16
if (threadIdx.x < TBL_SIZE) {
state_.tbl[threadIdx.x] = state->k->param_tbl[state_.pIdx][threadIdx.x];
state_.tmp_tbl[threadIdx.x] = state->k->temper_tbl[state_.pIdx][threadIdx.x];
}
__syncthreads();
int thread_data = 0;
for (int i = 0; i < LOOP_COUNT; ++i) {
thread_data += shared_curand(&state_);
}
data[blockIdx.x * blockDim.x + threadIdx.x] += thread_data;
//copy back to global memory state and offset
for (unsigned short p = 0; p < MTGP32_STATE_SIZE; p += blockDim.x) {
state->s[threadIdx.x + p] = state_.s[threadIdx.x + p];
}
if (threadIdx.x == 0) {
state->offset = state_.offset;
}
}
void init_mtgp(curandStateMtgp32_t * state, mtgp32_kernel_params_t * params, unsigned int seed = 123)
{
//Copy cuda kernel params from the pre-generated fast parmas 11213
curandMakeMTGP32Constants(mtgp32dc_params_fast_11213, params);
//Initialize the kernel states
curandMakeMTGP32KernelState(state, mtgp32dc_params_fast_11213, params, NBLOCKS, seed);
}
int main()
{
unsigned int * ddata1, *ddata2;
unsigned int * hdata1, *hdata2;
unsigned int loop_count = 1000;
const unsigned int size = NBLOCKS * BLOCKSIZE * sizeof(unsigned int);
curandStateMtgp32_t * d_random_state_;
mtgp32_kernel_params_t * d_random_params_;
//allocate data for 200 blocks each with 256 threads for each method tested
gpuErrchk(cudaMalloc(&ddata1, size));
gpuErrchk(cudaMemset(ddata1, 0, size));
gpuErrchk(cudaMalloc(&ddata2, size));
gpuErrchk(cudaMemset(ddata2, 0, size));
gpuErrchk(cudaMallocHost(&hdata1, size));
gpuErrchk(cudaMallocHost(&hdata2, size));
//Malloc memory for curand kernel params
gpuErrchk(cudaMalloc(reinterpret_cast<void**>(&d_random_params_), sizeof(mtgp32_kernel_params)));
//Malloc memory for curand kernel states
gpuErrchk(cudaMalloc(reinterpret_cast<void**>(&d_random_state_), NBLOCKS * sizeof(curandStateMtgp32)));
//Transfer loop bounds to const symbol
gpuErrchk(cudaMemcpyToSymbol(LOOP_COUNT, &loop_count, sizeof(int)));
init_mtgp(d_random_state_, d_random_params_, 123);
for (int i = 0; i < TEST_RUNS; ++i)
do_steps<<<NBLOCKS,BLOCKSIZE >>>(d_random_state_, ddata1);
init_mtgp(d_random_state_, d_random_params_, 123);
for (int i = 0; i < TEST_RUNS; ++i)
do_steps_shared << <NBLOCKS, BLOCKSIZE >> >(d_random_state_, ddata2);
cudaMemcpy(hdata1, ddata1, size, cudaMemcpyDeviceToHost);
cudaMemcpy(hdata2, ddata2, size, cudaMemcpyDeviceToHost);
unsigned int comp = 0;
for (int i = 0; i < NBLOCKS * BLOCKSIZE; ++i) {
comp += hdata1[i] ^ hdata2[i];
}
if (comp == 0) {
std::cout << "Both kernels returned same random numbers" << std::endl;
} else {
std::cerr << "My hacked code is wrong" << std::endl;
}
cudaFree(ddata1);
cudaFree(ddata2);
cudaFreeHost(hdata1);
cudaFreeHost(hdata2);
cudaFree(d_random_state_);
cudaFree(d_random_state_);
}
The performance speed-up I got from my GTX960 is as follows :
==618764== Profiling application: monte_carlo.exe
==618764== Profiling result:
==618764== Metric result:
Invocations Metric Name Metric Description Min Max Avg
Device "GeForce GTX 960 (0)"
Kernel: do_steps(curandStateMtgp32*, unsigned int*)
100 gld_requested_throughput Requested Global Load Throughput 106.70GB/s 110.84GB/s 108.55GB/s
100 shared_load_throughput Shared Memory Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s
Kernel: do_steps_shared(curandStateMtgp32*, unsigned int*)
100 gld_requested_throughput Requested Global Load Throughput 382.80MB/s 388.48MB/s 385.28MB/s
100 shared_load_throughput Shared Memory Load Throughput 806.88GB/s 818.86GB/s 814.08GB/s
==620428== Profiling application: monte_carlo.exe
==620428== Profiling result:
Time(%) Time Calls Avg Min Max Name
78.23% 937.09ms 100 9.3709ms 8.8739ms 10.748ms do_steps(curandStateMtgp32*, unsigned int*)
21.76% 260.64ms 100 2.6064ms 2.5860ms 2.6519ms do_steps_shared(curandStateMtgp32*, unsigned int*)
0.01% 158.05us 17 9.2970us 576ns 72.514us [CUDA memcpy HtoD]
0.00% 35.681us 2 17.840us 17.313us 18.368us [CUDA memcpy DtoH]
0.00% 11.808us 2 5.9040us 5.1520us 6.6560us [CUDA memset]
I’m posting this here to help anybody who is currently doing Monte Carlo simulations or wants to run curand() multiple times in a kernel.
If you have any feedback or suggestions I’d more than happy to read them :) .
PS: I removed licences to make the sample code shorter and more readable, my intentions are not to violate any copyrights from NVIDIA, Mutsuo Saito, Makoto Matsumoto and Hiroshima.