Mersenne twister CUDA problem Calculation issue?


I’m working on getting Mersenne twister working in parallel and found a good example of code to use.


 * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.


 * Please refer to the NVIDIA end user license agreement (EULA) associated

 * with this source code for terms and conditions that govern your use of

 * this software. Any use, reproduction, disclosure, or distribution of

 * this software and related documentation outside the terms of the EULA

 * is strictly prohibited.



#include "inc/shrUtils.h"

#include "MersenneTwister.h"

#include ""

__device__ static mt_struct_stripped ds_MT[MT_RNG_COUNT];

static mt_struct_stripped h_MT[MT_RNG_COUNT];


//Load twister configurations

void loadMTGPU(const char *fname){

    FILE *fd = fopen(fname, "rb");


        shrLog("initMTGPU(): failed to open %s\n", fname);




    if( !fread(h_MT, sizeof(h_MT), 1, fd) ){

        shrLog("initMTGPU(): failed to load %s\n", fname);







//Initialize/seed twister for current GPU context

void seedMTGPU(unsigned int seed){

    int i;

    //Need to be thread-safe

    mt_struct_stripped *MT = (mt_struct_stripped *)malloc(MT_RNG_COUNT * sizeof(mt_struct_stripped));

for(i = 0; i < MT_RNG_COUNT; i++){

        MT[i]      = h_MT[i];

        MT[i].seed = seed;


    cudaMemcpyToSymbol(ds_MT, MT, sizeof(h_MT));




// Write MT_RNG_COUNT vertical lanes of nPerRng random numbers to *d_Random.

// For coalesced global writes MT_RNG_COUNT should be a multiple of warp size.

// Initial states for each generator are the same, since the states are

// initialized from the global seed. In order to improve distribution properties

// on small NPerRng supply dedicated (local) seed to each twister.

// The local seeds, in their turn, can be extracted from global seed

// by means of any simple random number generator, like LCG.


__global__ void RandomGPU(

    float *d_Random,

    int nPerRng)


    const int      tid = blockDim.x * blockIdx.x + threadIdx.x;

int iState, iState1, iStateM, iOut;

    unsigned int mti, mti1, mtiM, x;

    unsigned int mt[MT_NN], matrix_a, mask_b, mask_c; 

//Load bit-vector Mersenne Twister parameters

    matrix_a = ds_MT[tid].matrix_a;

    mask_b = ds_MT[tid].mask_b;

    mask_c = ds_MT[tid].mask_c;

//Initialize current state

    mt[0] = ds_MT[tid].seed;

    for (iState = 1; iState < MT_NN; iState++)

        mt[iState] = (1812433253U * (mt[iState - 1] ^ (mt[iState - 1] >> 30)) + iState) & MT_WMASK;

iState = 0;

    mti1 = mt[0];

    for (iOut = 0; iOut < nPerRng; iOut++) {

        iState1 = iState + 1;

        iStateM = iState + MT_MM;

        if(iState1 >= MT_NN) iState1 -= MT_NN;

        if(iStateM >= MT_NN) iStateM -= MT_NN;

        mti  = mti1;

        mti1 = mt[iState1];

        mtiM = mt[iStateM];

// MT recurrence

        x    = (mti & MT_UMASK) | (mti1 & MT_LMASK);

        x    =  mtiM ^ (x >> 1) ^ ((x & 1) ? matrix_a : 0);

mt[iState] = x;

        iState = iState1;

//Tempering transformation

        x ^= (x >> MT_SHIFT0);

        x ^= (x << MT_SHIFTB) & mask_b;

        x ^= (x << MT_SHIFTC) & mask_c;

        x ^= (x >> MT_SHIFT1);

//Convert to (0, 1] float and write to global memory

        d_Random[tid] = ((float)x + 0.5f) * (1.0 / 4294967296.0f);

	cuPrintf("Random value: %d \n", d_Random[tid]);




// Transform each of MT_RNG_COUNT lanes of nPerRng uniformly distributed 

// random samples, produced by RandomGPU(), to normally distributed lanes

// using Cartesian form of Box-Muller transformation.

// nPerRng must be even.


#define PI 3.14159265358979f

__device__ inline void BoxMuller(float& u1, float& u2){

    float   r = sqrtf(-2.0f * logf(u1));

    float phi = 2 * PI * u2;

    u1 = r * __cosf(phi);

    u2 = r * __sinf(phi);


__global__ void BoxMullerGPU(float *d_Random, int nPerRng){

    const int      tid = blockDim.x * blockIdx.x + threadIdx.x;

for (int iOut = 0; iOut < nPerRng; iOut += 2)


                d_Random[tid + (iOut + 0) * MT_RNG_COUNT],

                d_Random[tid + (iOut + 1) * MT_RNG_COUNT]



The calculations are the same as the most recent release of MT, however when added to a simple program to grab random numbers and print them out (using cuPrintf) the numbers that come out are around 1 billion…

//Convert to (0, 1] float and write to global memory

        d_Random[tid] = ((float)x + 0.5f) * (1.0 / 4294967296.0f);

	cuPrintf("Random value: %d \n", d_Random[tid]);

This is the line that “should” convert the large numbers to a (0, 1] orientation. When I take out * (1.0 / 4294967296.0f) and get the original number (1328328124) and multiply it by (1.0 / 4294967296.0f) the number I get is (0.30927…) but when my program runs it comes up with numbers around 784,175,089.

My main function is

#pragma once

#include "cuda_runtime.h"

#include <stdio.h>

#include <conio.h>

#include <cuda.h>

#include <math.h>

#include <ctime>

#include ""

#include <windows.h>

#include ""


using namespace std;

#define blocks		1

#define threads		32

#define numRng		blocks * threads

float *d_random;

float random[numRng];

int main(void){

unsigned int seed = time(0);



cudaMalloc((float**)&d_random, blocks * threads * sizeof(float));


RandomGPU<<<blocks,threads>>>(d_random, numRng);

cudaPrintfDisplay(stdout, true);


printf("Press any key to exit...");



return 0;


Any suggestions? I’m totally baffled by this

I have messed around with the code to be sure that it is getting to the calculations and it is.
I can change the calculation of the section of code to convert to (0, 1] and it reflects changes in the result.

I still can’t seem to get it down far enough to get it anywhere near (0, 1].

Another odd problem is that if I decide to subtract 1,000,000,000 from the answer (which is slightly less than the range) the result is suddenly negative around -831,624,408.

meaning if I would have 1,062,794,147 and subtracted 1,000,000,000 it gives me -831,624,408 instead of something around 62,794,147…
I really have no idea what’s happening here… I suspected somehow I was getting the wrong memory location but I have checked that and I’m sure that I have the right locations.

I’ve the same problem: the generated numbers are between -1.0 and 3.0. I think there is something wrong with the implementation.
Did you figured out the solution?