Hey,
I’m working on getting the Mersenne Twister running in parallel on the GPU and found a good example of code to use:
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#include "inc/shrUtils.h"
#include "MersenneTwister.h"
#include "cuPrintf.cu"
__device__ static mt_struct_stripped ds_MT[MT_RNG_COUNT];
static mt_struct_stripped h_MT[MT_RNG_COUNT];
/*
//Load twister configurations
void loadMTGPU(const char *fname){
    FILE *fd = fopen(fname, "rb");
    if(!fd){
        shrLog("initMTGPU(): failed to open %s\n", fname);
        shrLog("FAILED\n");
        exit(0);
    }
    if( !fread(h_MT, sizeof(h_MT), 1, fd) ){
        shrLog("initMTGPU(): failed to load %s\n", fname);
        shrLog("FAILED\n");
        exit(0);
    }
    fclose(fd);
}
*/
//Initialize/seed twister for current GPU context
void seedMTGPU(unsigned int seed){
int i;
//Need to be thread-safe
mt_struct_stripped *MT = (mt_struct_stripped *)malloc(MT_RNG_COUNT * sizeof(mt_struct_stripped));
for(i = 0; i < MT_RNG_COUNT; i++){
MT[i] = h_MT[i];
MT[i].seed = seed;
}
cudaMemcpyToSymbol(ds_MT, MT, sizeof(h_MT));
free(MT);
}
////////////////////////////////////////////////////////////////////////////////
// Write MT_RNG_COUNT vertical lanes of nPerRng random numbers to *d_Random.
// For coalesced global writes MT_RNG_COUNT should be a multiple of warp size.
// Initial states for each generator are the same, since the states are
// initialized from the global seed. In order to improve distribution properties
// on small NPerRng supply dedicated (local) seed to each twister.
// The local seeds, in their turn, can be extracted from global seed
// by means of any simple random number generator, like LCG.
////////////////////////////////////////////////////////////////////////////////
__global__ void RandomGPU(
    float *d_Random,
    int nPerRng)
{
    const int tid = blockDim.x * blockIdx.x + threadIdx.x;

    int iState, iState1, iStateM, iOut;
    unsigned int mti, mti1, mtiM, x;
    unsigned int mt[MT_NN], matrix_a, mask_b, mask_c;

    //Load bit-vector Mersenne Twister parameters
    matrix_a = ds_MT[tid].matrix_a;
    mask_b   = ds_MT[tid].mask_b;
    mask_c   = ds_MT[tid].mask_c;

    //Initialize current state
    mt[0] = ds_MT[tid].seed;
    for (iState = 1; iState < MT_NN; iState++)
        mt[iState] = (1812433253U * (mt[iState - 1] ^ (mt[iState - 1] >> 30)) + iState) & MT_WMASK;

    iState = 0;
    mti1 = mt[0];
    for (iOut = 0; iOut < nPerRng; iOut++) {
        iState1 = iState + 1;
        iStateM = iState + MT_MM;
        if(iState1 >= MT_NN) iState1 -= MT_NN;
        if(iStateM >= MT_NN) iStateM -= MT_NN;
        mti  = mti1;
        mti1 = mt[iState1];
        mtiM = mt[iStateM];

        // MT recurrence
        x = (mti & MT_UMASK) | (mti1 & MT_LMASK);
        x = mtiM ^ (x >> 1) ^ ((x & 1) ? matrix_a : 0);

        mt[iState] = x;
        iState = iState1;

        //Tempering transformation
        x ^= (x >> MT_SHIFT0);
        x ^= (x << MT_SHIFTB) & mask_b;
        x ^= (x << MT_SHIFTC) & mask_c;
        x ^= (x >> MT_SHIFT1);

        //Convert to (0, 1] float and write to global memory
        d_Random[tid] = ((float)x + 0.5f) * (1.0 / 4294967296.0f);
        cuPrintf("Random value: %d \n", d_Random[tid]);
    }
}
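//The banner comment above RandomGPU() says each twister can be given its own
//local seed, derived from the global seed with a simple LCG. I haven't tried
//it, but I assume it means a variant of seedMTGPU() like the one below (the
//function name and the LCG constants are my own guess, untested):
/*
void seedMTGPU_local(unsigned int seed){
    mt_struct_stripped *MT = (mt_struct_stripped *)malloc(MT_RNG_COUNT * sizeof(mt_struct_stripped));
    unsigned int lcg = seed;
    for(int i = 0; i < MT_RNG_COUNT; i++){
        //Step the LCG once per generator so each twister is seeded differently
        lcg = lcg * 1664525U + 1013904223U;
        MT[i] = h_MT[i];
        MT[i].seed = lcg;
    }
    cudaMemcpyToSymbol(ds_MT, MT, sizeof(h_MT));
    free(MT);
}
*/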
////////////////////////////////////////////////////////////////////////////////
// Transform each of MT_RNG_COUNT lanes of nPerRng uniformly distributed
// random samples, produced by RandomGPU(), to normally distributed lanes
// using Cartesian form of Box-Muller transformation.
// nPerRng must be even.
////////////////////////////////////////////////////////////////////////////////
#define PI 3.14159265358979f
__device__ inline void BoxMuller(float& u1, float& u2){
    float   r = sqrtf(-2.0f * logf(u1));
    float phi = 2 * PI * u2;
    u1 = r * __cosf(phi);
    u2 = r * __sinf(phi);
}

__global__ void BoxMullerGPU(float *d_Random, int nPerRng){
    const int tid = blockDim.x * blockIdx.x + threadIdx.x;

    for (int iOut = 0; iOut < nPerRng; iOut += 2)
        BoxMuller(
            d_Random[tid + (iOut + 0) * MT_RNG_COUNT],
            d_Random[tid + (iOut + 1) * MT_RNG_COUNT]
        );
}
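(As an aside, I’m not using BoxMullerGPU yet. My understanding from the comments is that it would simply be launched on the same buffer right after RandomGPU, with the same launch configuration and an even nPerRng, i.e. something like

BoxMullerGPU<<<blocks, threads>>>(d_Random, nPerRng); //nPerRng must be even

but I haven’t tried that part.)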
The calculations are the same as in the most recent release of MT. However, when I drop this into a simple program that grabs random numbers and prints them out (using cuPrintf), the numbers that come out are around 1 billion…
//Convert to (0, 1] float and write to global memory
d_Random[tid] = ((float)x + 0.5f) * (1.0 / 4294967296.0f);
cuPrintf("Random value: %d \n", d_Random[tid]);
This is the line that “should” map the raw 32-bit values into the (0, 1] range. If I take out the * (1.0 / 4294967296.0f), grab the raw number (1328328124), and multiply it by (1.0 / 4294967296.0f) by hand, I get 0.30927…, but when my program runs it comes up with numbers around 784,175,089.
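For reference, here is that hand check as a tiny stand-alone host program (my own throwaway test, nothing from the sample in it):

#include <stdio.h>

int main(void){
    unsigned int x = 1328328124U;
    //Exactly the conversion expression from the kernel, run on the CPU
    float r = ((float)x + 0.5f) * (1.0 / 4294967296.0f);
    printf("%f\n", r); //prints 0.309273
    return 0;
}

So the conversion expression itself produces the value I expect when it runs on the host.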
My main program is:
#pragma once
#include "cuda_runtime.h"
#include <stdio.h>
#include <conio.h>
#include <cuda.h>
#include <math.h>
#include <ctime>
#include "cuPrintf.cu"
#include <windows.h>
#include "MersenneTwister_kernelTest.cu"
#define __CUDA_INTERNAL_COMPILATION__

using namespace std;

#define blocks 1
#define threads 32
#define numRng (blocks * threads)

float *d_random;
float random[numRng];

int main(void){
    unsigned int seed = (unsigned int)time(0);
    seedMTGPU(seed);

    cudaSetDevice(0);
    cudaMalloc((void**)&d_random, blocks * threads * sizeof(float));
    cudaPrintfInit();

    RandomGPU<<<blocks,threads>>>(d_random, numRng);

    cudaPrintfDisplay(stdout, true);
    cudaPrintfEnd();

    printf("Press any key to exit...");
    getch();
    cudaFree(d_random);
    return 0;
}
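One cross-check I haven’t done yet is copying the buffer back and printing it with plain printf instead of cuPrintf, something like this after the kernel launch:

float h_random[numRng];
cudaMemcpy(h_random, d_random, numRng * sizeof(float), cudaMemcpyDeviceToHost);
for (int i = 0; i < numRng; i++)
    printf("h_random[%d] = %f\n", i, h_random[i]);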
Any suggestions? I’m totally baffled by this.